From b4ac552a0803e698076329c257bec70eb7505e3d Mon Sep 17 00:00:00 2001
From: Hassnaa Hamdi
Date: Sat, 5 Jul 2025 05:57:55 +0000
Subject: [PATCH 01/12] [CGP]: Optimize mul.overflow.

- Detect cases where the LHS and RHS values will not cause overflow (when
  both Hi parts are zero).
- Detect cases where either the LHS or the RHS value cannot cause overflow
  (when one of the Hi parts is zero).
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp | 573 +++
 llvm/test/CodeGen/AArch64/i128-math.ll | 504 +-
 .../CodeGen/AArch64/i128_with_overflow.ll | 198 +-
 .../umulo-128-legalisation-lowering.ll | 205 +-
 .../ARM/umulo-128-legalisation-lowering.ll | 579 ++-
 .../ARM/umulo-64-legalisation-lowering.ll | 107 +-
 .../CodeGen/LoongArch/smul-with-overflow.ll | 985 +++-
 .../umulo-128-legalisation-lowering.ll | 439 +-
 .../RISCV/umulo-128-legalisation-lowering.ll | 355 +-
 llvm/test/CodeGen/RISCV/xaluo.ll | 2857 ++++-
 .../SPARC/smulo-128-legalisation-lowering.ll | 1255 ++++-
 .../SPARC/umulo-128-legalisation-lowering.ll | 605 ++-
 .../Thumb/umulo-128-legalisation-lowering.ll | 654 ++-
 .../Thumb2/umulo-128-legalisation-lowering.ll | 294 +-
 .../Thumb2/umulo-64-legalisation-lowering.ll | 51 +-
 llvm/test/CodeGen/X86/muloti.ll | 177 +-
 .../X86/smulo-128-legalisation-lowering.ll | 4105 +++++-
 .../X86/umulo-128-legalisation-lowering.ll | 454 +-
 .../X86/umulo-64-legalisation-lowering.ll | 85 +-
 llvm/test/CodeGen/X86/xmulo.ll | 1625 +++-
 20 files changed, 13125 insertions(+), 2982 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9db4c9e5e2807..238718e471e47 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -431,6 +431,8 @@ class CodeGenPrepare {
   bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy,
                           unsigned AddrSpace);
   bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
+  bool optimizeUMulWithOverflow(Instruction *I);
+  bool optimizeSMulWithOverflow(Instruction *I);
   bool optimizeInlineAsmInst(CallInst *CS);
   bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT);
   bool optimizeExt(Instruction *&I);
@@ -2778,6 +2780,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       }
     }
     return false;
+  case Intrinsic::umul_with_overflow:
+    return optimizeUMulWithOverflow(II);
+  case Intrinsic::smul_with_overflow:
+    return optimizeSMulWithOverflow(II);
   }
 
   SmallVector PtrOps;
@@ -6389,6 +6395,573 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
   return true;
 }
 
+// Rewrite the umul_with_overflow intrinsic by checking whether one or both of
+// the operands' values fit within the legal (half-width) type. If so, a cheaper
+// multiplication sequence can be used. Ideally this rewrite would be done
+// during type legalization, but it requires reconstructing IR, which is not
+// possible there, so we do it here instead.
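+//
+// Illustrative sketch (assuming an i128 multiply on a target whose legal type
+// is i64; LLo/LHi/RLo/RHi are informal names for the 64-bit halves): write
+//   LHS = LHi * 2^64 + LLo,  RHS = RHi * 2^64 + RLo.
+// When RHi == 0 the full product is
+//   LHS * RHS = RHS * LLo + ((RHS * LHi) << 64) = P0 + (P1 << 64),
+// so only two 64x64->128 multiplies are needed: the low limb of the result is
+// lo(P0), the middle limb is hi(P0) + lo(P1) (combined via uadd.with.overflow),
+// and the product overflows i128 iff the top limb, hi(P1) plus the carry out of
+// the middle add, is non-zero. The symmetric case handles LHi == 0; when both
+// high halves are zero a plain mul with a constant-false overflow flag is
+// emitted, and only when both high halves are non-zero does control reach the
+// original intrinsic in the "overflow" block. (optimizeSMulWithOverflow applies
+// the same decomposition to the absolute values and then restores the sign.)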
+bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { + if (TLI->getTypeAction( + I->getContext(), + TLI->getValueType(*DL, I->getType()->getContainedType(0))) != + TargetLowering::TypeExpandInteger) + return false; + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + auto *Ty = LHS->getType(); + unsigned VTBitWidth = Ty->getScalarSizeInBits(); + unsigned VTHalfBitWidth = VTBitWidth / 2; + auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); + + assert( + (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) == + TargetLowering::TypeLegal) && + "Expected the type to be legal for the target lowering"); + + I->getParent()->setName("overflow.res"); + auto *OverflowResBB = I->getParent(); + auto *OverflowoEntryBB = + I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); + BasicBlock *OverflowLHSBB = BasicBlock::Create( + I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowLHSBB = BasicBlock::Create( + I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create( + I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create( + I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowBB = BasicBlock::Create( + I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); + BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", + I->getFunction(), OverflowResBB); + // new blocks should be: + // entry: + // lhs_lo ne lhs_hi ? overflow_yes_lhs, overflow_no_lhs + + // overflow_yes_lhs: + // rhs_lo ne rhs_hi ? overflow : overflow_no_rhs_only + + // overflow_no_lhs: + // rhs_lo ne rhs_hi ? 
overflow_no_lhs_only : overflow_no + + // overflow_no_rhs_only: + // overflow_no_lhs_only: + // overflow_no: + // overflow: + // overflow.res: + + IRBuilder<> BuilderEntryBB(OverflowoEntryBB->getTerminator()); + IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB); + IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB); + IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB); + IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB); + IRBuilder<> BuilderNoOverflowBB(NoOverflowBB); + IRBuilder<> BuilderOverflowResBB(OverflowResBB, + OverflowResBB->getFirstInsertionPt()); + + //------------------------------------------------------------------------------ + // BB overflow.entry: + // get Lo and Hi of RHS & LHS: + + auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs.trunc"); + auto *ShrHiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + auto *HiRHS = BuilderEntryBB.CreateTrunc(ShrHiRHS, LegalTy, "hi.rhs.trunc"); + + auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs.trunc"); + auto *ShrHiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + auto *HiLHS = BuilderEntryBB.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc"); + + auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + ConstantInt::getNullValue(LegalTy)); + BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB); + OverflowoEntryBB->getTerminator()->eraseFromParent(); + + //------------------------------------------------------------------------------ + // BB overflow_yes_lhs: + Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB); + + //------------------------------------------------------------------------------ + // BB overflow_no_lhs: + Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB); + + //------------------------------------------------------------------------------ + // BB overflow_no_rhs_only: + // RHS is 64 value range, LHS is 128 + // P0 = RHS * LoLHS + // P1 = RHS * HiLHS + + LoLHS = BuilderNoOverflowRHSonlyBB.CreateZExt(LoLHS, Ty, "lo.lhs"); + + // P0 = (RHS * LoLHS) + auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, LoLHS, + "mul.no.overflow.rhs.lolhs"); + auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs"); + auto *P0Hi = + BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr"); + P0Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs"); + + // P1 = (RHS * HiLHS) + auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, ShrHiLHS, + "mul.no.overflow.rhs.hilhs"); + auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs"); + auto *P1Hi = + BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr"); + P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs"); + + auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic( + Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); + auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue( + AddOverflow, 0, "rhs.p0.p1.res"); + auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue( + AddOverflow, 1, "rhs.p0.p1.carry"); + Carry = + BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); + auto *ResHi = + BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry"); + + auto *ResLoEx = + 
BuilderNoOverflowRHSonlyBB.CreateZExt(P0Lo, Ty, "rhs.res_lo.zext"); + auto *ResMid = + BuilderNoOverflowRHSonlyBB.CreateZExt(AddOResMid, Ty, "rhs.res_mid.zext"); + auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, + "rhs.res_mid.shl"); + auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl, + "rhs.res_lo.or.mid"); + auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp( + ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), + "rhs.check.overflow"); + + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValNoOverflowRHS = PoisonValue::get(STy); + StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( + StructValNoOverflowRHS, FinalRes, {0}); + StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( + StructValNoOverflowRHS, IsOverflow, {1}); + BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ + + // BB overflow_no_lhs_only: + + LoRHS = BuilderNoOverflowLHSonlyBB.CreateZExt(LoRHS, Ty, "lo.rhs"); + + // P0 = (LHS * LoRHS) + P0 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, LoRHS, + "mul.no.overflow.lhs.lorhs"); + P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs"); + P0Hi = + BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs"); + P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs"); + + // P1 = (LHS * HiRHS) + P1 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, ShrHiRHS, + "mul.no.overflow.lhs.hirhs"); + P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.lhs"); + P1Hi = + BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr"); + P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs"); + + AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic( + Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); + AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0, + "lhs.p0.p1.res"); + Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1, + "lhs.p0.p1.carry"); + Carry = + BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); + ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry"); + + ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(P0Lo, Ty, "lhs.res_lo.zext"); + ResMid = + BuilderNoOverflowLHSonlyBB.CreateZExt(AddOResMid, Ty, "lhs.res_mid.zext"); + ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, + "lhs.res_mid.shl"); + FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl, + "lhs.res_lo.or.mid"); + IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp( + ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), + "lhs.check.overflow"); + + STy = StructType::get(I->getContext(), + {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValNoOverflowLHS = PoisonValue::get(STy); + StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( + StructValNoOverflowLHS, FinalRes, {0}); + StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( + StructValNoOverflowLHS, IsOverflow, {1}); + + BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ + + // BB overflow.no: + auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow"); + STy = StructType::get(I->getContext(), + {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value 
*StructValNoOverflow = PoisonValue::get(STy); + StructValNoOverflow = + BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0}); + StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue( + StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); + BuilderNoOverflowBB.CreateBr(OverflowResBB); + + // BB overflow.res: + auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2); + PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); + PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB); + PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB); + + // Before moving the mul.overflow intrinsic to the overflowBB, replace all its + // uses by PHINode. + I->replaceAllUsesWith(PHINode); + + // BB overflow: + PHINode->addIncoming(I, OverflowBB); + I->removeFromParent(); + I->insertInto(OverflowBB, OverflowBB->end()); + IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB); + + // return false to stop reprocessing the function. + return false; +} + +// Rewrite the smul_with_overflow intrinsic by checking if any/both of the +// operands' value range is within the legal type. If so, we can optimize the +// multiplication algorithm. This code is supposed to be written during the step +// of type legalization, but given that we need to reconstruct the IR which is +// not doable there, we do it here. +bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { + if (TLI->getTypeAction( + I->getContext(), + TLI->getValueType(*DL, I->getType()->getContainedType(0))) != + TargetLowering::TypeExpandInteger) + return false; + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + auto *Ty = LHS->getType(); + unsigned VTBitWidth = Ty->getScalarSizeInBits(); + unsigned VTHalfBitWidth = VTBitWidth / 2; + auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); + + assert( + (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) == + TargetLowering::TypeLegal) && + "Expected the type to be legal for the target lowering"); + + I->getParent()->setName("overflow.res"); + auto *OverflowResBB = I->getParent(); + auto *OverflowoEntryBB = + I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); + BasicBlock *OverflowLHSBB = BasicBlock::Create( + I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowLHSBB = BasicBlock::Create( + I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create( + I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create( + I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB); + BasicBlock *NoOverflowBB = BasicBlock::Create( + I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); + BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", + I->getFunction(), OverflowResBB); + // new blocks should be: + // entry: + // lhs_lo ne lhs_hi ? overflow_yes_lhs, overflow_no_lhs + + // overflow_yes_lhs: + // rhs_lo ne rhs_hi ? overflow : overflow_no_rhs_only + + // overflow_no_lhs: + // rhs_lo ne rhs_hi ? 
overflow_no_lhs_only : overflow_no + + // overflow_no_rhs_only: + // overflow_no_lhs_only: + // overflow_no: + // overflow: + // overflow.res: + + IRBuilder<> BuilderEntryBB(OverflowoEntryBB->getTerminator()); + IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB); + IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB); + IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB); + IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB); + IRBuilder<> BuilderNoOverflowBB(NoOverflowBB); + IRBuilder<> BuilderOverflowResBB(OverflowResBB, + OverflowResBB->getFirstInsertionPt()); + + //------------------------------------------------------------------------------ + // BB overflow.entry: + // get Lo and Hi of RHS & LHS: + + auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs"); + auto *SignLoRHS = + BuilderEntryBB.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + auto *HiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + HiRHS = BuilderEntryBB.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); + + auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs"); + auto *SignLoLHS = + BuilderEntryBB.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + auto *HiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = BuilderEntryBB.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + + auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); + BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB); + OverflowoEntryBB->getTerminator()->eraseFromParent(); + + //------------------------------------------------------------------------------ + // BB overflow_yes_lhs: + Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); + BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB); + + //------------------------------------------------------------------------------ + // BB overflow_no_lhs: + Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); + BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB); + + //------------------------------------------------------------------------------ + // BB overflow_no_rhs_only: + // RHS is within 64 value range, LHS is 128 + // P0 = RHS * LoLHS + // P1 = RHS * HiLHS + + // check sign of RHS: + auto *IsNegRHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(RHS, "rhs.isneg"); + auto *AbsRHSIntr = BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic( + Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {}, + "abs.rhs"); + auto *AbsRHS = BuilderNoOverflowRHSonlyBB.CreateSelect( + IsNegRHS, AbsRHSIntr, RHS, "lo.abs.rhs.select"); + + // check sign of LHS: + auto *IsNegLHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(LHS, "lhs.isneg"); + auto *AbsLHSIntr = BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic( + Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {}, + "abs.lhs"); + auto *AbsLHS = BuilderNoOverflowRHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr, + LHS, "abs.lhs.select"); + LoLHS = BuilderNoOverflowRHSonlyBB.CreateAnd( + AbsLHS, + ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)), + "lo.abs.lhs"); + HiLHS = BuilderNoOverflowRHSonlyBB.CreateLShr(AbsLHS, VTHalfBitWidth, + "hi.abs.lhs"); + + // P0 = (RHS * LoLHS) + auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, LoLHS, + "mul.no.overflow.rhs.lolhs"); + auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs"); + auto *P0Hi = + BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr"); + P0Hi = 
BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs"); + + // P1 = (RHS * HiLHS) + auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, HiLHS, + "mul.no.overflow.rhs.hilhs"); + auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs"); + auto *P1Hi = + BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr"); + P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs"); + + auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic( + Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); + auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue( + AddOverflow, 0, "rhs.p0.p1.res"); + auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue( + AddOverflow, 1, "rhs.p0.p1.carry"); + Carry = + BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); + auto *ResHi = + BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry"); + + // sign handling: + auto *IsNeg = BuilderNoOverflowRHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1 + auto *Mask = + BuilderNoOverflowRHSonlyBB.CreateSExt(IsNeg, LegalTy, "rhs.sign.mask"); + auto *Add_1 = + BuilderNoOverflowRHSonlyBB.CreateZExt(IsNeg, LegalTy, "rhs.add.1"); + auto *ResLo = + BuilderNoOverflowRHSonlyBB.CreateXor(P0Lo, Mask, "rhs.res_lo.xor.mask"); + ResLo = + BuilderNoOverflowRHSonlyBB.CreateAdd(ResLo, Add_1, "rhs.res_lo.add.1"); + + Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1, + "rhs.check.res_lo.carry"); + Carry = + BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); + auto *ResMid = BuilderNoOverflowRHSonlyBB.CreateXor(AddOResMid, Mask, + "rhs.res_mid.xor.mask"); + ResMid = + BuilderNoOverflowRHSonlyBB.CreateAdd(ResMid, Carry, "rhs.res_mid.carry"); + + Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid, + Carry, "rhs.check.reslo.carry"); + Carry = + BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); + ResHi = + BuilderNoOverflowRHSonlyBB.CreateXor(ResHi, Mask, "rhs.res_hi.xor.mask"); + ResHi = + BuilderNoOverflowRHSonlyBB.CreateAdd(ResHi, Carry, "rhs.res_hi.carry"); + // set the final result: + auto *ResLoEx = + BuilderNoOverflowRHSonlyBB.CreateZExt(ResLo, Ty, "rhs.res_lo.zext"); + ResMid = + BuilderNoOverflowRHSonlyBB.CreateZExt(ResMid, Ty, "rhs.res_mid.zext"); + auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, + "rhs.res_mid.shl"); + auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl, + "rhs.res_lo.or.mid"); + auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp( + ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), + "rhs.check.overflow"); + + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValNoOverflowRHS = PoisonValue::get(STy); + StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( + StructValNoOverflowRHS, FinalRes, {0}); + StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( + StructValNoOverflowRHS, IsOverflow, {1}); + BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ + + // BB overflow_no_lhs_only: + // LHS (64), RHS is 128 + // P0 = LHS * LoRHS + // P1 = LHS * HiRHS + + // check sign of LHS: + IsNegLHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(LHS, "lhs.isneg"); + AbsLHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic( + Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {}, + 
"abs.lhs"); + AbsLHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr, LHS, + "abs.lhs.select"); + + // check sign of RHS: + IsNegRHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(RHS, "rhs.isneg"); + AbsRHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic( + Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {}, + "abs.rhs"); + AbsRHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegRHS, AbsRHSIntr, RHS, + "abs.rhs.select"); + + LoRHS = BuilderNoOverflowLHSonlyBB.CreateAnd( + AbsRHS, + ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)), + "lo.abs.rhs"); + HiRHS = BuilderNoOverflowLHSonlyBB.CreateLShr(AbsRHS, VTHalfBitWidth, + "hi.abs.rhs"); + + // P0 = (LHS * LoRHS) + P0 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, LoRHS, + "mul.no.overflow.lhs.lorhs"); + P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs"); + P0Hi = + BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs"); + P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs"); + + // P1 = (LHS * HiRHS) + P1 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, HiRHS, + "mul.no.overflow.lhs.hirhs"); + P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.lhs"); + P1Hi = + BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr"); + P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs"); + + AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic( + Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); + AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0, + "lhs.p0.p1.res"); + Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1, + "lhs.p0.p1.carry"); + Carry = + BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); + ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry"); + + // sign handling: + IsNeg = BuilderNoOverflowLHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1 + Mask = BuilderNoOverflowLHSonlyBB.CreateSExt(IsNeg, LegalTy, "lhs.sign.mask"); + Add_1 = BuilderNoOverflowLHSonlyBB.CreateZExt(IsNeg, LegalTy, "lhs.add.1"); + ResLo = + BuilderNoOverflowLHSonlyBB.CreateXor(P0Lo, Mask, "lhs.res_lo.xor.mask"); + ResLo = + BuilderNoOverflowLHSonlyBB.CreateAdd(ResLo, Add_1, "lhs.res_lo.add.1"); + + Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1, + "lhs.check.res_lo.carry"); + Carry = + BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); + ResMid = BuilderNoOverflowLHSonlyBB.CreateXor(AddOResMid, Mask, + "lhs.res_mid.xor.mask"); + ResMid = + BuilderNoOverflowLHSonlyBB.CreateAdd(ResMid, Carry, "lhs.res_mid.carry"); + + Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid, + Carry, "lhs.check.reslo.carry"); + Carry = + BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); + ResHi = + BuilderNoOverflowLHSonlyBB.CreateXor(ResHi, Mask, "lhs.res_hi.xor.mask"); + ResHi = + BuilderNoOverflowLHSonlyBB.CreateAdd(ResHi, Carry, "lhs.res_hi.carry"); + // Set the final result: + ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(ResLo, Ty, "lhs.res_lo.zext"); + ResMid = + BuilderNoOverflowLHSonlyBB.CreateZExt(ResMid, Ty, "lhs.res_mid.zext"); + ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, + "lhs.res_mid.shl"); + FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl, + "lhs.res_lo.or.mid"); + IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp( + ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), + 
"lhs.check.overflow"); + + STy = StructType::get(I->getContext(), + {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValNoOverflowLHS = PoisonValue::get(STy); + StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( + StructValNoOverflowLHS, FinalRes, {0}); + StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( + StructValNoOverflowLHS, IsOverflow, {1}); + + BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ + + // BB overflow.no: + auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow"); + STy = StructType::get(I->getContext(), + {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValNoOverflow = PoisonValue::get(STy); + StructValNoOverflow = + BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0}); + StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue( + StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); + BuilderNoOverflowBB.CreateBr(OverflowResBB); + + // BB overflow.res: + auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2); + PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); + PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB); + PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB); + + // Before moving the mul.overflow intrinsic to the overflowBB, replace all its + // uses by PHINode. + I->replaceAllUsesWith(PHINode); + + // BB overflow: + PHINode->addIncoming(I, OverflowBB); + I->removeFromParent(); + I->insertInto(OverflowBB, OverflowBB->end()); + IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB); + + // return false to stop reprocessing the function. + return false; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. 
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 9e1c0c1b115ab..e2791f44d0a08 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -261,21 +261,55 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: -; CHECK: // %bb.0: +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: cbz x1, .LBB17_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cbz x3, .LBB17_5 +; CHECK-NEXT: // %bb.2: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: b .LBB17_8 +; CHECK-NEXT: .LBB17_3: // %overflow.no.lhs +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: cbz x3, .LBB17_7 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: madd x8, x1, x2, x8 +; CHECK-NEXT: umulh x9, x0, x3 +; CHECK-NEXT: mul x10, x0, x3 +; CHECK-NEXT: mul x11, x1, x3 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: b .LBB17_6 +; CHECK-NEXT: .LBB17_5: // %overflow.no.rhs.only +; CHECK-NEXT: umulh x8, x2, x0 +; CHECK-NEXT: umulh x9, x2, x1 +; CHECK-NEXT: madd x8, x3, x0, x8 +; CHECK-NEXT: mul x10, x2, x1 +; CHECK-NEXT: mul x11, x3, x1 +; CHECK-NEXT: mul x0, x2, x0 +; CHECK-NEXT: .LBB17_6: // %overflow.res +; CHECK-NEXT: adds x1, x8, x10 +; CHECK-NEXT: adcs xzr, x9, x11 +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: b .LBB17_8 +; CHECK-NEXT: .LBB17_7: // %overflow.no +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: .LBB17_8: // %overflow.res +; CHECK-NEXT: mov w9, #1 // =0x1 +; CHECK-NEXT: bic w2, w9, w8 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -289,20 +323,54 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: -; CHECK: // %bb.0: +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: cbz x1, .LBB18_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cbz x3, .LBB18_5 +; CHECK-NEXT: // %bb.2: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 -; CHECK-NEXT: csinc w2, w8, wzr, lo +; CHECK-NEXT: csinc w8, w8, wzr, lo +; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_3: // %overflow.no.lhs +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: cbz x3, .LBB18_7 +; CHECK-NEXT: // %bb.4: // 
%overflow.no.lhs.only +; CHECK-NEXT: madd x8, x1, x2, x8 +; CHECK-NEXT: umulh x9, x0, x3 +; CHECK-NEXT: mul x10, x0, x3 +; CHECK-NEXT: mul x11, x1, x3 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: b .LBB18_6 +; CHECK-NEXT: .LBB18_5: // %overflow.no.rhs.only +; CHECK-NEXT: umulh x8, x2, x0 +; CHECK-NEXT: umulh x9, x2, x1 +; CHECK-NEXT: madd x8, x3, x0, x8 +; CHECK-NEXT: mul x10, x2, x1 +; CHECK-NEXT: mul x11, x3, x1 +; CHECK-NEXT: mul x0, x2, x0 +; CHECK-NEXT: .LBB18_6: // %overflow.res +; CHECK-NEXT: adds x1, x8, x10 +; CHECK-NEXT: adcs xzr, x9, x11 +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_7: // %overflow.no +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: and w2, wzr, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -315,21 +383,54 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: -; CHECK: // %bb.0: -; CHECK-NEXT: mul x9, x3, x0 +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: cbz x1, .LBB19_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cbz x3, .LBB19_5 +; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: mul x8, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 -; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: adds x9, x11, x9 +; CHECK-NEXT: adds x9, x12, x11 ; CHECK-NEXT: csinc w10, w10, wzr, lo -; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: b .LBB19_8 +; CHECK-NEXT: .LBB19_3: // %overflow.no.lhs +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: cbz x3, .LBB19_7 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: madd x9, x1, x2, x8 +; CHECK-NEXT: umulh x10, x0, x3 +; CHECK-NEXT: mul x11, x0, x3 +; CHECK-NEXT: mul x12, x1, x3 +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: b .LBB19_6 +; CHECK-NEXT: .LBB19_5: // %overflow.no.rhs.only +; CHECK-NEXT: umulh x8, x2, x0 +; CHECK-NEXT: umulh x10, x2, x1 +; CHECK-NEXT: madd x9, x3, x0, x8 +; CHECK-NEXT: mul x11, x2, x1 +; CHECK-NEXT: mul x12, x3, x1 +; CHECK-NEXT: mul x8, x2, x0 +; CHECK-NEXT: .LBB19_6: // %overflow.res +; CHECK-NEXT: adds x9, x9, x11 +; CHECK-NEXT: adcs xzr, x10, x12 +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: b .LBB19_8 +; CHECK-NEXT: .LBB19_7: // %overflow.no +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: madd x9, x1, x2, x8 +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB19_8: // %overflow.res +; CHECK-NEXT: tst w10, #0x1 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq ; CHECK-NEXT: ret @@ -354,7 +455,14 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: -; CHECK: // %bb.0: +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 +; CHECK-NEXT: cmp x1, x0, asr #63 +; CHECK-NEXT: b.eq .LBB21_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB21_5 +; CHECK-NEXT: // %bb.2: // %overflow ; 
CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -364,24 +472,106 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: adds x9, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 ; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: mov x1, x9 +; CHECK-NEXT: adc x9, x12, x13 ; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 -; CHECK-NEXT: asr x11, x8, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq -; CHECK-NEXT: cset w2, eq +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x10, x10, x11 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x9, #0, eq +; CHECK-NEXT: b .LBB21_7 +; CHECK-NEXT: .LBB21_3: // %overflow.no.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB21_8 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: asr x8, x1, #63 +; CHECK-NEXT: asr x10, x3, #63 +; CHECK-NEXT: eor x9, x0, x8 +; CHECK-NEXT: eor x11, x1, x8 +; CHECK-NEXT: eor x12, x2, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: eor x11, x3, x10 +; CHECK-NEXT: csel x8, x8, x1, lt +; CHECK-NEXT: csel x9, x9, x0, lt +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: csel x11, x12, x2, lt +; CHECK-NEXT: csel x10, x10, x3, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w12, w13 +; CHECK-NEXT: b .LBB21_6 +; CHECK-NEXT: .LBB21_5: // %overflow.no.rhs.only +; CHECK-NEXT: asr x8, x3, #63 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: eor x9, x2, x8 +; CHECK-NEXT: eor x11, x3, x8 +; CHECK-NEXT: eor x12, x0, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: eor x11, x1, x10 +; CHECK-NEXT: csel x8, x8, x3, lt +; CHECK-NEXT: csel x9, x9, x2, lt +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: csel x11, x12, x0, lt +; CHECK-NEXT: csel x10, x10, x1, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w13, w12 +; CHECK-NEXT: .LBB21_6: // %overflow.res +; CHECK-NEXT: sbfx x12, x10, #0, #1 +; CHECK-NEXT: adds x8, x8, x11 +; CHECK-NEXT: adc x9, x9, x15 +; CHECK-NEXT: eor x13, x14, x12 +; CHECK-NEXT: eor x8, x8, x12 +; CHECK-NEXT: add x0, x13, x10 +; CHECK-NEXT: cmp x0, x10 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cinc x1, x8, lo +; CHECK-NEXT: eor x8, x9, x12 +; CHECK-NEXT: cmp x1, x10 +; CHECK-NEXT: cinc x8, x8, lo +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: .LBB21_7: // %overflow.res +; CHECK-NEXT: 
cset w8, ne +; CHECK-NEXT: b .LBB21_9 +; CHECK-NEXT: .LBB21_8: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: .LBB21_9: // %overflow.res +; CHECK-NEXT: mov w9, #1 // =0x1 +; CHECK-NEXT: bic w2, w9, w8 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -395,7 +585,14 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: -; CHECK: // %bb.0: +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 +; CHECK-NEXT: cmp x1, x0, asr #63 +; CHECK-NEXT: b.eq .LBB22_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB22_5 +; CHECK-NEXT: // %bb.2: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -405,24 +602,104 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: adds x9, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 ; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: mov x1, x9 +; CHECK-NEXT: adc x9, x12, x13 ; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 -; CHECK-NEXT: asr x11, x8, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq -; CHECK-NEXT: cset w2, ne +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x10, x10, x11 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x9, #0, eq +; CHECK-NEXT: b .LBB22_7 +; CHECK-NEXT: .LBB22_3: // %overflow.no.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB22_8 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: asr x8, x1, #63 +; CHECK-NEXT: asr x10, x3, #63 +; CHECK-NEXT: eor x9, x0, x8 +; CHECK-NEXT: eor x11, x1, x8 +; CHECK-NEXT: eor x12, x2, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: eor x11, x3, x10 +; CHECK-NEXT: csel x8, x8, x1, lt +; CHECK-NEXT: csel x9, x9, x0, lt +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: csel x11, x12, x2, lt +; CHECK-NEXT: csel x10, x10, x3, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w12, w13 +; CHECK-NEXT: b .LBB22_6 +; CHECK-NEXT: .LBB22_5: // %overflow.no.rhs.only +; CHECK-NEXT: asr x8, x3, #63 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: eor x9, x2, x8 +; CHECK-NEXT: eor x11, x3, x8 +; CHECK-NEXT: eor x12, x0, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: eor x11, x1, x10 +; CHECK-NEXT: csel x8, x8, x3, lt +; CHECK-NEXT: csel x9, x9, x2, lt +; 
CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: csel x11, x12, x0, lt +; CHECK-NEXT: csel x10, x10, x1, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w13, w12 +; CHECK-NEXT: .LBB22_6: // %overflow.res +; CHECK-NEXT: sbfx x12, x10, #0, #1 +; CHECK-NEXT: adds x8, x8, x11 +; CHECK-NEXT: adc x9, x9, x15 +; CHECK-NEXT: eor x13, x14, x12 +; CHECK-NEXT: eor x8, x8, x12 +; CHECK-NEXT: add x0, x13, x10 +; CHECK-NEXT: cmp x0, x10 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cinc x1, x8, lo +; CHECK-NEXT: eor x8, x9, x12 +; CHECK-NEXT: cmp x1, x10 +; CHECK-NEXT: cinc x8, x8, lo +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: .LBB22_7: // %overflow.res +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_8: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: and w2, wzr, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -435,7 +712,14 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: -; CHECK: // %bb.0: +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 +; CHECK-NEXT: cmp x1, x0, asr #63 +; CHECK-NEXT: b.eq .LBB23_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB23_5 +; CHECK-NEXT: // %bb.2: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -448,26 +732,106 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adc x8, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 ; CHECK-NEXT: adds x9, x14, x10 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: adc x10, x12, x13 -; CHECK-NEXT: smulh x12, x1, x3 -; CHECK-NEXT: asr x13, x8, #63 -; CHECK-NEXT: asr x14, x10, #63 -; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: adds x8, x11, x8 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: adc x10, x12, x10 -; CHECK-NEXT: eor x12, x3, x1 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x10, x10, x11 -; CHECK-NEXT: asr x11, x12, #63 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csinv x0, x13, x11, eq -; CHECK-NEXT: csel x1, x10, x9, ne +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: asr x14, x9, #63 +; CHECK-NEXT: smulh x10, x1, x3 +; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: asr x12, x8, #63 +; CHECK-NEXT: asr x13, x11, #63 +; CHECK-NEXT: adds x11, x8, x11 +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: adc x12, x12, x13 +; CHECK-NEXT: adds x11, x15, x11 +; CHECK-NEXT: adc x10, x10, x12 +; CHECK-NEXT: cmp x11, x14 +; CHECK-NEXT: ccmp x10, x14, #0, eq +; CHECK-NEXT: b .LBB23_7 +; CHECK-NEXT: .LBB23_3: // %overflow.no.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB23_8 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: asr x8, x1, #63 +; CHECK-NEXT: asr x10, x3, #63 +; CHECK-NEXT: eor x9, x0, x8 +; CHECK-NEXT: eor x11, x1, x8 +; CHECK-NEXT: eor x12, x2, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; 
CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: eor x11, x3, x10 +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: csel x8, x8, x1, lt +; CHECK-NEXT: csel x9, x9, x0, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: csel x11, x12, x2, lt +; CHECK-NEXT: csel x10, x10, x3, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w12, w13 +; CHECK-NEXT: b .LBB23_6 +; CHECK-NEXT: .LBB23_5: // %overflow.no.rhs.only +; CHECK-NEXT: asr x8, x3, #63 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: eor x9, x2, x8 +; CHECK-NEXT: eor x11, x3, x8 +; CHECK-NEXT: eor x12, x0, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: eor x11, x1, x10 +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: csel x8, x8, x3, lt +; CHECK-NEXT: csel x9, x9, x2, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: csel x11, x12, x0, lt +; CHECK-NEXT: csel x10, x10, x1, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w13, w12 +; CHECK-NEXT: .LBB23_6: // %overflow.res +; CHECK-NEXT: sbfx x12, x10, #0, #1 +; CHECK-NEXT: adds x11, x8, x11 +; CHECK-NEXT: eor x13, x14, x12 +; CHECK-NEXT: add x8, x13, x10 +; CHECK-NEXT: adc x13, x9, x15 +; CHECK-NEXT: eor x9, x11, x12 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cinc x9, x9, lo +; CHECK-NEXT: cmp x9, x10 +; CHECK-NEXT: eor x10, x13, x12 +; CHECK-NEXT: cinc x10, x10, lo +; CHECK-NEXT: cmp x10, #0 +; CHECK-NEXT: .LBB23_7: // %overflow.res +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: b .LBB23_9 +; CHECK-NEXT: .LBB23_8: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: madd x9, x1, x2, x8 +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB23_9: // %overflow.res +; CHECK-NEXT: eor x11, x3, x1 +; CHECK-NEXT: tst w10, #0x1 +; CHECK-NEXT: asr x11, x11, #63 +; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff +; CHECK-NEXT: csinv x0, x8, x11, eq +; CHECK-NEXT: csel x1, x12, x9, ne ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 9924b7c63f763..ef004085373cd 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -223,22 +223,49 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: -; CHECK: // %bb.0: // %entry +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: cbz x1, .LBB4_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cbz x3, .LBB4_5 +; CHECK-NEXT: // %bb.2: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; 
CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.ne .LBB4_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: tbnz w8, #0, .LBB4_7 +; CHECK-NEXT: b .LBB4_8 +; CHECK-NEXT: .LBB4_3: // %overflow.no.lhs +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: cbz x3, .LBB4_9 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: madd x8, x1, x2, x8 +; CHECK-NEXT: umulh x9, x0, x3 +; CHECK-NEXT: mul x10, x0, x3 +; CHECK-NEXT: mul x11, x1, x3 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: b .LBB4_6 +; CHECK-NEXT: .LBB4_5: // %overflow.no.rhs.only +; CHECK-NEXT: umulh x8, x2, x0 +; CHECK-NEXT: umulh x9, x2, x1 +; CHECK-NEXT: madd x8, x3, x0, x8 +; CHECK-NEXT: mul x10, x2, x1 +; CHECK-NEXT: mul x11, x3, x1 +; CHECK-NEXT: mul x0, x2, x0 +; CHECK-NEXT: .LBB4_6: // %overflow.res +; CHECK-NEXT: adds x1, x8, x10 +; CHECK-NEXT: adcs xzr, x9, x11 +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: tbz w8, #0, .LBB4_8 +; CHECK-NEXT: .LBB4_7: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -247,10 +274,15 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB4_8: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_2: // %if.end +; CHECK-NEXT: .LBB4_9: // %overflow.no +; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: tbnz w8, #0, .LBB4_7 +; CHECK-NEXT: b .LBB4_8 entry: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 @@ -272,35 +304,115 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: -; CHECK: // %bb.0: // %entry +; CHECK: // %bb.0: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 +; CHECK-NEXT: cmp x1, x0, asr #63 +; CHECK-NEXT: b.eq .LBB5_3 +; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB5_5 +; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: adds x9, x14, x10 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: smulh x10, x1, x3 +; CHECK-NEXT: mov x1, x9 +; CHECK-NEXT: adc x9, x12, x13 +; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adds x8, x8, x9 +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x10, x10, x11 +; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: ccmp x10, x9, #0, eq +; CHECK-NEXT: b .LBB5_7 +; CHECK-NEXT: .LBB5_3: // %overflow.no.lhs +; CHECK-NEXT: cmp x3, x8 +; CHECK-NEXT: b.eq .LBB5_10 +; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only +; CHECK-NEXT: asr x8, x1, #63 +; CHECK-NEXT: asr x10, x3, #63 +; CHECK-NEXT: eor x9, x0, x8 +; CHECK-NEXT: eor x11, x1, x8 +; CHECK-NEXT: eor x12, x2, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; 
CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: eor x11, x3, x10 +; CHECK-NEXT: csel x8, x8, x1, lt +; CHECK-NEXT: csel x9, x9, x0, lt +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: csel x11, x12, x2, lt +; CHECK-NEXT: csel x10, x10, x3, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w12, w13 +; CHECK-NEXT: b .LBB5_6 +; CHECK-NEXT: .LBB5_5: // %overflow.no.rhs.only +; CHECK-NEXT: asr x8, x3, #63 ; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: umulh x11, x0, x2 -; CHECK-NEXT: asr x14, x3, #63 -; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mul x12, x1, x2 -; CHECK-NEXT: umulh x9, x1, x2 -; CHECK-NEXT: mul x10, x10, x2 -; CHECK-NEXT: adds x11, x12, x11 -; CHECK-NEXT: mul x15, x0, x3 -; CHECK-NEXT: umulh x13, x0, x3 -; CHECK-NEXT: adc x9, x9, x10 -; CHECK-NEXT: mul x14, x0, x14 -; CHECK-NEXT: mul x16, x1, x3 -; CHECK-NEXT: adds x1, x15, x11 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: smulh x8, x8, x3 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: asr x12, x10, #63 -; CHECK-NEXT: adds x9, x9, x10 -; CHECK-NEXT: adc x10, x11, x12 -; CHECK-NEXT: adds x9, x16, x9 -; CHECK-NEXT: asr x11, x1, #63 -; CHECK-NEXT: adc x8, x8, x10 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB5_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: eor x9, x2, x8 +; CHECK-NEXT: eor x11, x3, x8 +; CHECK-NEXT: eor x12, x0, x10 +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: sbc x8, x11, x8 +; CHECK-NEXT: cmp x3, #0 +; CHECK-NEXT: eor x11, x1, x10 +; CHECK-NEXT: csel x8, x8, x3, lt +; CHECK-NEXT: csel x9, x9, x2, lt +; CHECK-NEXT: cset w13, lt +; CHECK-NEXT: subs x12, x12, x10 +; CHECK-NEXT: sbc x10, x11, x10 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: csel x11, x12, x0, lt +; CHECK-NEXT: csel x10, x10, x1, lt +; CHECK-NEXT: umulh x12, x9, x11 +; CHECK-NEXT: mul x14, x9, x11 +; CHECK-NEXT: mul x15, x8, x10 +; CHECK-NEXT: madd x8, x8, x11, x12 +; CHECK-NEXT: cset w12, lt +; CHECK-NEXT: mul x11, x9, x10 +; CHECK-NEXT: umulh x9, x9, x10 +; CHECK-NEXT: eor w10, w13, w12 +; CHECK-NEXT: .LBB5_6: // %overflow.res +; CHECK-NEXT: sbfx x12, x10, #0, #1 +; CHECK-NEXT: adds x8, x8, x11 +; CHECK-NEXT: adc x9, x9, x15 +; CHECK-NEXT: eor x13, x14, x12 +; CHECK-NEXT: eor x8, x8, x12 +; CHECK-NEXT: add x0, x13, x10 +; CHECK-NEXT: cmp x0, x10 +; CHECK-NEXT: cset w10, lo +; CHECK-NEXT: cinc x1, x8, lo +; CHECK-NEXT: eor x8, x9, x12 +; CHECK-NEXT: cmp x1, x10 +; CHECK-NEXT: cinc x8, x8, lo +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: .LBB5_7: // %overflow.res +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: tbz w8, #0, .LBB5_9 +; CHECK-NEXT: .LBB5_8: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -309,10 +421,16 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB5_9: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: .LBB5_10: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: tbnz w8, #0, .LBB5_8 +; CHECK-NEXT: b .LBB5_9 entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index edfd80b4f2706..a240055b3f655 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -3,20 +3,54 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: -; AARCH: // %bb.0: // %start +; AARCH: // %bb.0: // %overflow.entry +; AARCH-NEXT: cbz x1, .LBB0_3 +; AARCH-NEXT: // %bb.1: // %overflow.lhs +; AARCH-NEXT: cbz x3, .LBB0_5 +; AARCH-NEXT: // %bb.2: // %overflow ; AARCH-NEXT: mul x9, x3, x0 ; AARCH-NEXT: cmp x1, #0 ; AARCH-NEXT: ccmp x3, #0, #4, ne -; AARCH-NEXT: umulh x8, x1, x2 -; AARCH-NEXT: umulh x10, x3, x0 +; AARCH-NEXT: umulh x10, x1, x2 +; AARCH-NEXT: umulh x8, x3, x0 ; AARCH-NEXT: madd x9, x1, x2, x9 -; AARCH-NEXT: ccmp xzr, x8, #0, eq -; AARCH-NEXT: umulh x11, x0, x2 ; AARCH-NEXT: ccmp xzr, x10, #0, eq +; AARCH-NEXT: umulh x11, x0, x2 +; AARCH-NEXT: ccmp xzr, x8, #0, eq ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: cset w8, ne ; AARCH-NEXT: adds x1, x11, x9 -; AARCH-NEXT: csinc w2, w8, wzr, lo +; AARCH-NEXT: csinc w8, w8, wzr, lo +; AARCH-NEXT: and w2, w8, #0x1 +; AARCH-NEXT: ret +; AARCH-NEXT: .LBB0_3: // %overflow.no.lhs +; AARCH-NEXT: umulh x8, x0, x2 +; AARCH-NEXT: cbz x3, .LBB0_7 +; AARCH-NEXT: // %bb.4: // %overflow.no.lhs.only +; AARCH-NEXT: madd x8, x1, x2, x8 +; AARCH-NEXT: umulh x9, x0, x3 +; AARCH-NEXT: mul x10, x0, x3 +; AARCH-NEXT: mul x11, x1, x3 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: b .LBB0_6 +; AARCH-NEXT: .LBB0_5: // %overflow.no.rhs.only +; AARCH-NEXT: umulh x8, x2, x0 +; AARCH-NEXT: umulh x9, x2, x1 +; AARCH-NEXT: madd x8, x3, x0, x8 +; AARCH-NEXT: mul x10, x2, x1 +; AARCH-NEXT: mul x11, x3, x1 +; AARCH-NEXT: mul x0, x2, x0 +; AARCH-NEXT: .LBB0_6: // %overflow.res +; AARCH-NEXT: adds x1, x8, x10 +; AARCH-NEXT: adcs xzr, x9, x11 +; AARCH-NEXT: cset w8, ne +; AARCH-NEXT: and w2, w8, #0x1 +; AARCH-NEXT: ret +; AARCH-NEXT: .LBB0_7: // %overflow.no +; AARCH-NEXT: madd x8, x0, x3, x8 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: madd x1, x1, x2, x8 +; AARCH-NEXT: and w2, wzr, #0x1 ; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 @@ -34,46 +68,133 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: -; AARCH: // %bb.0: // %Entry -; AARCH-NEXT: asr x11, x1, #63 -; AARCH-NEXT: asr x9, x3, #63 -; AARCH-NEXT: umulh x12, x0, x2 -; AARCH-NEXT: mov x8, x1 +; AARCH: // %bb.0: // %overflow.entry +; AARCH-NEXT: asr x8, x2, #63 +; AARCH-NEXT: cmp x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: mul x13, 
x1, x2 -; AARCH-NEXT: umulh x10, x1, x2 -; AARCH-NEXT: mul x11, x11, x2 -; AARCH-NEXT: adds x12, x13, x12 -; AARCH-NEXT: mul x15, x0, x3 -; AARCH-NEXT: umulh x14, x0, x3 -; AARCH-NEXT: adc x10, x10, x11 -; AARCH-NEXT: mul x9, x0, x9 -; AARCH-NEXT: mul x16, x1, x3 -; AARCH-NEXT: adds x1, x15, x12 -; AARCH-NEXT: asr x12, x10, #63 -; AARCH-NEXT: smulh x11, x8, x3 -; AARCH-NEXT: adc x9, x14, x9 -; AARCH-NEXT: asr x13, x9, #63 -; AARCH-NEXT: adds x9, x10, x9 -; AARCH-NEXT: asr x10, x1, #63 +; AARCH-NEXT: b.eq .LBB1_3 +; AARCH-NEXT: // %bb.1: // %overflow.lhs +; AARCH-NEXT: cmp x3, x8 +; AARCH-NEXT: b.eq .LBB1_5 +; AARCH-NEXT: // %bb.2: // %overflow +; AARCH-NEXT: asr x9, x1, #63 +; AARCH-NEXT: umulh x10, x0, x2 +; AARCH-NEXT: asr x13, x3, #63 +; AARCH-NEXT: mul x11, x1, x2 +; AARCH-NEXT: umulh x8, x1, x2 +; AARCH-NEXT: mul x9, x9, x2 +; AARCH-NEXT: adds x10, x11, x10 +; AARCH-NEXT: mul x14, x0, x3 +; AARCH-NEXT: umulh x12, x0, x3 +; AARCH-NEXT: adc x9, x8, x9 +; AARCH-NEXT: mul x13, x0, x13 +; AARCH-NEXT: adds x8, x14, x10 +; AARCH-NEXT: mul x15, x1, x3 +; AARCH-NEXT: smulh x10, x1, x3 +; AARCH-NEXT: adc x11, x12, x13 +; AARCH-NEXT: asr x12, x9, #63 +; AARCH-NEXT: asr x13, x11, #63 ; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: adds x9, x9, x11 +; AARCH-NEXT: asr x11, x8, #63 ; AARCH-NEXT: adc x12, x12, x13 -; AARCH-NEXT: adds x9, x16, x9 -; AARCH-NEXT: adc x11, x11, x12 -; AARCH-NEXT: cmp x9, x10 -; AARCH-NEXT: ccmp x11, x10, #0, eq +; AARCH-NEXT: adds x9, x15, x9 +; AARCH-NEXT: adc x10, x10, x12 +; AARCH-NEXT: cmp x9, x11 +; AARCH-NEXT: ccmp x10, x11, #0, eq +; AARCH-NEXT: b .LBB1_7 +; AARCH-NEXT: .LBB1_3: // %overflow.no.lhs +; AARCH-NEXT: cmp x3, x8 +; AARCH-NEXT: b.eq .LBB1_8 +; AARCH-NEXT: // %bb.4: // %overflow.no.lhs.only +; AARCH-NEXT: asr x8, x1, #63 +; AARCH-NEXT: asr x10, x3, #63 +; AARCH-NEXT: eor x9, x0, x8 +; AARCH-NEXT: eor x11, x1, x8 +; AARCH-NEXT: eor x12, x2, x10 +; AARCH-NEXT: subs x9, x9, x8 +; AARCH-NEXT: sbc x8, x11, x8 +; AARCH-NEXT: cmp x1, #0 +; AARCH-NEXT: eor x11, x3, x10 +; AARCH-NEXT: cset w13, lt +; AARCH-NEXT: csel x8, x8, x1, lt +; AARCH-NEXT: csel x9, x9, x0, lt +; AARCH-NEXT: subs x12, x12, x10 +; AARCH-NEXT: sbc x10, x11, x10 +; AARCH-NEXT: cmp x3, #0 +; AARCH-NEXT: csel x11, x12, x2, lt +; AARCH-NEXT: csel x10, x10, x3, lt +; AARCH-NEXT: umulh x12, x9, x11 +; AARCH-NEXT: mul x15, x8, x10 +; AARCH-NEXT: madd x8, x8, x11, x12 +; AARCH-NEXT: cset w12, lt +; AARCH-NEXT: mul x14, x9, x11 +; AARCH-NEXT: mul x11, x9, x10 +; AARCH-NEXT: umulh x9, x9, x10 +; AARCH-NEXT: eor w10, w12, w13 +; AARCH-NEXT: b .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %overflow.no.rhs.only +; AARCH-NEXT: asr x8, x3, #63 +; AARCH-NEXT: asr x10, x1, #63 +; AARCH-NEXT: eor x9, x2, x8 +; AARCH-NEXT: eor x11, x3, x8 +; AARCH-NEXT: eor x12, x0, x10 +; AARCH-NEXT: subs x9, x9, x8 +; AARCH-NEXT: sbc x8, x11, x8 +; AARCH-NEXT: cmp x3, #0 +; AARCH-NEXT: eor x11, x1, x10 +; AARCH-NEXT: cset w13, lt +; AARCH-NEXT: csel x8, x8, x3, lt +; AARCH-NEXT: csel x9, x9, x2, lt +; AARCH-NEXT: subs x12, x12, x10 +; AARCH-NEXT: sbc x10, x11, x10 +; AARCH-NEXT: cmp x1, #0 +; AARCH-NEXT: csel x11, x12, x0, lt +; AARCH-NEXT: csel x10, x10, x1, lt +; AARCH-NEXT: umulh x12, x9, x11 +; AARCH-NEXT: mul x14, x9, x11 +; AARCH-NEXT: mul x15, x8, x10 +; AARCH-NEXT: madd x8, x8, x11, x12 +; AARCH-NEXT: cset w12, lt +; AARCH-NEXT: mul x11, x9, x10 +; AARCH-NEXT: umulh x9, x9, x10 +; AARCH-NEXT: eor w10, w13, w12 +; AARCH-NEXT: .LBB1_6: // %overflow.res +; AARCH-NEXT: sbfx x12, x10, #0, #1 +; AARCH-NEXT: adds x8, x8, x11 +; AARCH-NEXT: 
adc x9, x9, x15 +; AARCH-NEXT: eor x13, x14, x12 +; AARCH-NEXT: eor x8, x8, x12 +; AARCH-NEXT: eor x9, x9, x12 +; AARCH-NEXT: add x0, x13, x10 +; AARCH-NEXT: cmp x0, x10 +; AARCH-NEXT: cset w10, lo +; AARCH-NEXT: cinc x8, x8, lo +; AARCH-NEXT: cmp x8, x10 +; AARCH-NEXT: cinc x9, x9, lo +; AARCH-NEXT: cmp x9, #0 +; AARCH-NEXT: .LBB1_7: // %overflow.res ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbz x8, #63, .LBB1_2 -; AARCH-NEXT: // %bb.1: // %Entry -; AARCH-NEXT: eor x8, x3, #0x8000000000000000 -; AARCH-NEXT: orr x8, x2, x8 -; AARCH-NEXT: cbz x8, .LBB1_3 -; AARCH-NEXT: .LBB1_2: // %Else2 -; AARCH-NEXT: cbz w9, .LBB1_4 -; AARCH-NEXT: .LBB1_3: // %Then7 -; AARCH-NEXT: mov w8, #1 // =0x1 -; AARCH-NEXT: str w8, [x4] -; AARCH-NEXT: .LBB1_4: // %Block9 +; AARCH-NEXT: tbnz x1, #63, .LBB1_9 +; AARCH-NEXT: b .LBB1_10 +; AARCH-NEXT: .LBB1_8: // %overflow.no +; AARCH-NEXT: umulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: madd x8, x0, x3, x8 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: madd x8, x1, x2, x8 +; AARCH-NEXT: tbz x1, #63, .LBB1_10 +; AARCH-NEXT: .LBB1_9: // %overflow.res +; AARCH-NEXT: eor x10, x3, #0x8000000000000000 +; AARCH-NEXT: orr x10, x2, x10 +; AARCH-NEXT: cbz x10, .LBB1_11 +; AARCH-NEXT: .LBB1_10: // %Else2 +; AARCH-NEXT: tbz w9, #0, .LBB1_12 +; AARCH-NEXT: .LBB1_11: // %Then7 +; AARCH-NEXT: mov w9, #1 // =0x1 +; AARCH-NEXT: str w9, [x4] +; AARCH-NEXT: .LBB1_12: // %Block9 +; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: store i32 0, ptr %2, align 4 diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll index 4eb82c80e2bff..8f35b6df7a937 100644 --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -4,212 +4,425 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6-LABEL: muloti_test: -; ARMV6: @ %bb.0: @ %start +; ARMV6: @ %bb.0: @ %overflow.entry ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: ldr r4, [sp, #72] -; ARMV6-NEXT: mov r7, r0 -; ARMV6-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMV6-NEXT: ldr r12, [sp, #64] -; ARMV6-NEXT: umull r1, r0, r2, r4 +; ARMV6-NEXT: add lr, sp, #76 ; ARMV6-NEXT: ldr r5, [sp, #68] -; ARMV6-NEXT: str r1, [r7] -; ARMV6-NEXT: ldr r1, [sp, #76] -; ARMV6-NEXT: umull r7, r6, r1, r12 -; ARMV6-NEXT: str r6, [sp, #8] @ 4-byte Spill -; ARMV6-NEXT: umull r6, r9, r5, r4 -; ARMV6-NEXT: add r7, r6, r7 -; ARMV6-NEXT: umull r4, r6, r12, r4 -; ARMV6-NEXT: str r4, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: mov r4, #0 -; ARMV6-NEXT: adds r8, r6, r7 -; ARMV6-NEXT: ldr r6, [sp, #80] -; ARMV6-NEXT: adc r7, r4, #0 -; ARMV6-NEXT: ldr r4, [sp, #84] -; ARMV6-NEXT: str r7, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: umull r12, lr, r3, r6 -; ARMV6-NEXT: umull r11, r7, r4, r2 -; ARMV6-NEXT: add r12, r11, r12 -; ARMV6-NEXT: umull r11, r10, r6, r2 -; ARMV6-NEXT: adds r12, r10, r12 -; ARMV6-NEXT: mov r10, #0 -; ARMV6-NEXT: adc r6, r10, #0 -; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: adds r6, r6, r11 -; ARMV6-NEXT: str r6, [sp, #12] @ 4-byte Spill -; ARMV6-NEXT: adc r6, r8, r12 -; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: ldr r6, [sp, #72] -; ARMV6-NEXT: mov r12, #0 -; ARMV6-NEXT: umull r2, r8, r2, r1 -; ARMV6-NEXT: umlal r0, r12, r3, r6 -; ARMV6-NEXT: adds r0, r2, r0 -; ARMV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; ARMV6-NEXT: adcs r8, r12, r8 -; ARMV6-NEXT: adc 
r12, r10, #0 -; ARMV6-NEXT: cmp lr, #0 -; ARMV6-NEXT: str r0, [r2, #4] -; ARMV6-NEXT: movne lr, #1 -; ARMV6-NEXT: ldr r11, [sp, #8] @ 4-byte Reload -; ARMV6-NEXT: cmp r7, #0 -; ARMV6-NEXT: movne r7, #1 -; ARMV6-NEXT: ldr r0, [sp, #64] -; ARMV6-NEXT: cmp r11, #0 -; ARMV6-NEXT: umlal r8, r12, r3, r1 -; ARMV6-NEXT: movne r11, #1 -; ARMV6-NEXT: cmp r9, #0 -; ARMV6-NEXT: movne r9, #1 -; ARMV6-NEXT: orrs r10, r0, r5 -; ARMV6-NEXT: ldr r0, [sp, #80] +; ARMV6-NEXT: ldr r6, [sp, #64] +; ARMV6-NEXT: mov r9, r0 +; ARMV6-NEXT: ldr r11, [sp, #72] +; ARMV6-NEXT: orrs r10, r6, r5 +; ARMV6-NEXT: ldm lr, {r1, r12, lr} +; ARMV6-NEXT: beq .LBB0_3 +; ARMV6-NEXT: @ %bb.1: @ %overflow.lhs +; ARMV6-NEXT: orrs r8, r12, lr +; ARMV6-NEXT: beq .LBB0_5 +; ARMV6-NEXT: @ %bb.2: @ %overflow +; ARMV6-NEXT: umull r4, r0, r3, r12 +; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: umull r7, r0, lr, r2 +; ARMV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; ARMV6-NEXT: umull r0, r12, r12, r2 +; ARMV6-NEXT: add r4, r7, r4 +; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: mov r0, #0 +; ARMV6-NEXT: adds r7, r12, r4 +; ARMV6-NEXT: str r7, [sp] @ 4-byte Spill +; ARMV6-NEXT: adc r0, r0, #0 +; ARMV6-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: mov r0, r11 +; ARMV6-NEXT: umull r11, r12, r1, r6 +; ARMV6-NEXT: umull r7, r4, r5, r0 +; ARMV6-NEXT: add r7, r7, r11 +; ARMV6-NEXT: umull r11, r6, r6, r0 +; ARMV6-NEXT: adds r6, r6, r7 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: adc r7, r7, #0 +; ARMV6-NEXT: str r7, [sp, #4] @ 4-byte Spill +; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: adds r7, r11, r7 +; ARMV6-NEXT: str r7, [sp, #8] @ 4-byte Spill +; ARMV6-NEXT: ldr r7, [sp] @ 4-byte Reload +; ARMV6-NEXT: adc r6, r6, r7 +; ARMV6-NEXT: str r6, [sp] @ 4-byte Spill +; ARMV6-NEXT: umull r11, r6, r2, r0 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: umlal r6, r7, r3, r0 +; ARMV6-NEXT: umull r2, r0, r2, r1 +; ARMV6-NEXT: adds r2, r2, r6 +; ARMV6-NEXT: str r2, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: adcs r0, r7, r0 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: adc r6, r7, #0 +; ARMV6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; ARMV6-NEXT: umlal r0, r6, r3, r1 +; ARMV6-NEXT: adds r2, r0, r2 +; ARMV6-NEXT: ldr r0, [sp] @ 4-byte Reload +; ARMV6-NEXT: adcs r0, r6, r0 +; ARMV6-NEXT: adc r6, r7, #0 +; ARMV6-NEXT: cmp r8, #0 +; ARMV6-NEXT: movne r8, #1 +; ARMV6-NEXT: cmp r10, #0 ; ARMV6-NEXT: movne r10, #1 -; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; ARMV6-NEXT: orrs r0, r0, r4 -; ARMV6-NEXT: movne r0, #1 ; ARMV6-NEXT: cmp r4, #0 ; ARMV6-NEXT: movne r4, #1 -; ARMV6-NEXT: cmp r3, #0 -; ARMV6-NEXT: movne r3, #1 -; ARMV6-NEXT: cmp r5, #0 -; ARMV6-NEXT: movne r5, #1 ; ARMV6-NEXT: cmp r1, #0 ; ARMV6-NEXT: movne r1, #1 -; ARMV6-NEXT: adds r6, r8, r6 -; ARMV6-NEXT: str r6, [r2, #8] +; ARMV6-NEXT: cmp r5, #0 +; ARMV6-NEXT: movne r5, #1 ; ARMV6-NEXT: and r1, r5, r1 +; ARMV6-NEXT: cmp r12, #0 +; ARMV6-NEXT: orr r1, r1, r4 +; ARMV6-NEXT: ldr r5, [sp, #4] @ 4-byte Reload +; ARMV6-NEXT: movne r12, #1 +; ARMV6-NEXT: orr r1, r1, r12 +; ARMV6-NEXT: str r6, [sp, #8] @ 4-byte Spill +; ARMV6-NEXT: and r6, r10, r8 +; ARMV6-NEXT: orr r1, r1, r5 +; ARMV6-NEXT: orr r1, r6, r1 +; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: cmp r6, #0 +; ARMV6-NEXT: movne r6, #1 +; ARMV6-NEXT: cmp r3, #0 +; ARMV6-NEXT: movne r3, #1 +; ARMV6-NEXT: cmp lr, #0 +; ARMV6-NEXT: movne lr, #1 +; ARMV6-NEXT: and r3, lr, r3 +; ARMV6-NEXT: orr r3, r3, r6 +; ARMV6-NEXT: ldr r6, [sp, #20] @ 4-byte Reload +; 
ARMV6-NEXT: cmp r6, #0 +; ARMV6-NEXT: movne r6, #1 +; ARMV6-NEXT: orr r3, r3, r6 ; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: orr r1, r1, r9 -; ARMV6-NEXT: orr r1, r1, r11 -; ARMV6-NEXT: and r0, r10, r0 -; ARMV6-NEXT: adcs r6, r12, r6 -; ARMV6-NEXT: str r6, [r2, #12] -; ARMV6-NEXT: ldr r6, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: orr r1, r1, r6 -; ARMV6-NEXT: orr r0, r0, r1 -; ARMV6-NEXT: and r1, r4, r3 -; ARMV6-NEXT: orr r1, r1, r7 -; ARMV6-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; ARMV6-NEXT: orr r1, r1, lr +; ARMV6-NEXT: orr r3, r3, r6 ; ARMV6-NEXT: orr r1, r1, r3 -; ARMV6-NEXT: orr r0, r0, r1 -; ARMV6-NEXT: mov r1, #0 -; ARMV6-NEXT: adc r1, r1, #0 -; ARMV6-NEXT: orr r0, r0, r1 -; ARMV6-NEXT: and r0, r0, #1 -; ARMV6-NEXT: strb r0, [r2, #16] +; ARMV6-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; ARMV6-NEXT: orr r6, r1, r3 +; ARMV6-NEXT: b .LBB0_8 +; ARMV6-NEXT: .LBB0_3: @ %overflow.no.lhs +; ARMV6-NEXT: orrs r6, r12, lr +; ARMV6-NEXT: beq .LBB0_7 +; ARMV6-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; ARMV6-NEXT: umull r0, r4, r2, r12 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: mov r10, #0 +; ARMV6-NEXT: umlal r4, r7, r3, r12 +; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: umull r6, r8, r2, lr +; ARMV6-NEXT: adds r0, r6, r4 +; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: adcs r6, r7, r8 +; ARMV6-NEXT: adc r7, r10, #0 +; ARMV6-NEXT: ldr r10, [sp, #64] +; ARMV6-NEXT: umlal r6, r7, r3, lr +; ARMV6-NEXT: umull r0, r8, r12, r10 +; ARMV6-NEXT: mla r4, r12, r5, r8 +; ARMV6-NEXT: mov r8, r11 +; ARMV6-NEXT: adds r12, r6, r0 +; ARMV6-NEXT: mov r6, #0 +; ARMV6-NEXT: mla r4, lr, r10, r4 +; ARMV6-NEXT: adc lr, r7, r4 +; ARMV6-NEXT: umull r11, r4, r2, r11 +; ARMV6-NEXT: umlal r4, r6, r3, r8 +; ARMV6-NEXT: umull r2, r0, r2, r1 +; ARMV6-NEXT: adds r7, r2, r4 +; ARMV6-NEXT: adcs r2, r6, r0 +; ARMV6-NEXT: mov r0, #0 +; ARMV6-NEXT: adc r4, r0, #0 +; ARMV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: umlal r2, r4, r3, r1 +; ARMV6-NEXT: umull r3, r6, r8, r10 +; ARMV6-NEXT: mla r5, r8, r5, r6 +; ARMV6-NEXT: adds r2, r2, r3 +; ARMV6-NEXT: mla r1, r1, r10, r5 +; ARMV6-NEXT: adc r1, r4, r1 +; ARMV6-NEXT: adds r2, r2, r0 +; ARMV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; ARMV6-NEXT: adcs r0, r1, r0 +; ARMV6-NEXT: adcs r1, r12, #0 +; ARMV6-NEXT: adc r3, lr, #0 +; ARMV6-NEXT: b .LBB0_6 +; ARMV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; ARMV6-NEXT: mov r10, r6 +; ARMV6-NEXT: umull r0, r6, r11, r6 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: umlal r6, r7, r1, r10 +; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: umull r4, r8, r11, r5 +; ARMV6-NEXT: adds r0, r4, r6 +; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: adcs r6, r7, r8 +; ARMV6-NEXT: mov r0, #0 +; ARMV6-NEXT: adc r7, r0, #0 +; ARMV6-NEXT: umull r0, r8, r10, r12 +; ARMV6-NEXT: mla r4, r10, lr, r8 +; ARMV6-NEXT: umlal r6, r7, r1, r5 +; ARMV6-NEXT: mla r4, r5, r12, r4 +; ARMV6-NEXT: adds r10, r6, r0 +; ARMV6-NEXT: adc r0, r7, r4 +; ARMV6-NEXT: str r0, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: mov r0, r11 +; ARMV6-NEXT: umull r11, r6, r11, r2 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: umull r4, r5, r0, r3 +; ARMV6-NEXT: mov r0, #0 +; ARMV6-NEXT: umlal r6, r7, r1, r2 +; ARMV6-NEXT: adds r8, r4, r6 +; ARMV6-NEXT: adcs r4, r7, r5 +; ARMV6-NEXT: adc r5, r0, #0 +; ARMV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: umlal r4, r5, r1, r3 +; ARMV6-NEXT: mov r7, r8 +; ARMV6-NEXT: umull r1, r6, r2, r12 +; ARMV6-NEXT: mla r2, r2, lr, r6 +; ARMV6-NEXT: adds r1, r4, r1 +; ARMV6-NEXT: mla r2, r3, r12, 
r2 +; ARMV6-NEXT: adc r3, r5, r2 +; ARMV6-NEXT: adds r2, r1, r0 +; ARMV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; ARMV6-NEXT: adcs r0, r3, r0 +; ARMV6-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: adcs r1, r10, #0 +; ARMV6-NEXT: adc r3, r3, #0 +; ARMV6-NEXT: .LBB0_6: @ %overflow.res +; ARMV6-NEXT: orrs r6, r1, r3 +; ARMV6-NEXT: movne r6, #1 +; ARMV6-NEXT: b .LBB0_8 +; ARMV6-NEXT: .LBB0_7: @ %overflow.no +; ARMV6-NEXT: mov r0, r11 +; ARMV6-NEXT: umull r11, r8, r2, r11 +; ARMV6-NEXT: mov r7, #0 +; ARMV6-NEXT: mov r6, #0 +; ARMV6-NEXT: umlal r8, r7, r3, r0 +; ARMV6-NEXT: umull r4, r10, r2, r1 +; ARMV6-NEXT: adds r0, r4, r8 +; ARMV6-NEXT: ldr r4, [sp, #64] +; ARMV6-NEXT: adcs r10, r7, r10 +; ARMV6-NEXT: ldr r7, [sp, #72] +; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill +; ARMV6-NEXT: adc r0, r6, #0 +; ARMV6-NEXT: umlal r10, r0, r3, r1 +; ARMV6-NEXT: umull r8, r4, r7, r4 +; ARMV6-NEXT: mla r4, r7, r5, r4 +; ARMV6-NEXT: ldr r5, [sp, #64] +; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: mla r1, r1, r5, r4 +; ARMV6-NEXT: umull r4, r5, r12, r2 +; ARMV6-NEXT: mla r3, r12, r3, r5 +; ARMV6-NEXT: mla r2, lr, r2, r3 +; ARMV6-NEXT: adds r3, r4, r8 +; ARMV6-NEXT: adc r1, r2, r1 +; ARMV6-NEXT: adds r2, r10, r3 +; ARMV6-NEXT: adc r0, r0, r1 +; ARMV6-NEXT: .LBB0_8: @ %overflow.res +; ARMV6-NEXT: str r11, [r9] +; ARMV6-NEXT: str r7, [r9, #4] +; ARMV6-NEXT: str r2, [r9, #8] +; ARMV6-NEXT: str r0, [r9, #12] +; ARMV6-NEXT: and r0, r6, #1 +; ARMV6-NEXT: strb r0, [r9, #16] ; ARMV6-NEXT: add sp, sp, #28 ; ARMV6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; ARMV7-LABEL: muloti_test: -; ARMV7: @ %bb.0: @ %start +; ARMV7: @ %bb.0: @ %overflow.entry ; ARMV7-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; ARMV7-NEXT: sub sp, sp, #44 -; ARMV7-NEXT: ldr r8, [sp, #88] -; ARMV7-NEXT: mov r9, r0 -; ARMV7-NEXT: ldr r7, [sp, #96] -; ARMV7-NEXT: ldr lr, [sp, #100] -; ARMV7-NEXT: umull r0, r5, r2, r8 -; ARMV7-NEXT: ldr r4, [sp, #80] -; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: umull r1, r0, r3, r7 -; ARMV7-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: umull r0, r11, lr, r2 -; ARMV7-NEXT: str r1, [sp, #20] @ 4-byte Spill -; ARMV7-NEXT: ldr r1, [sp, #92] -; ARMV7-NEXT: str r0, [sp] @ 4-byte Spill -; ARMV7-NEXT: umull r0, r10, r7, r2 -; ARMV7-NEXT: mov r7, r1 -; ARMV7-NEXT: umull r6, r12, r1, r4 -; ARMV7-NEXT: str r0, [sp, #40] @ 4-byte Spill -; ARMV7-NEXT: ldr r0, [sp, #84] -; ARMV7-NEXT: str r6, [sp, #24] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r1, r0, r8 -; ARMV7-NEXT: str r6, [sp, #16] @ 4-byte Spill -; ARMV7-NEXT: umull r6, r2, r2, r7 -; ARMV7-NEXT: mov r7, r4 -; ARMV7-NEXT: str r6, [sp, #8] @ 4-byte Spill -; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill -; ARMV7-NEXT: umull r2, r6, r4, r8 -; ARMV7-NEXT: str r2, [sp, #36] @ 4-byte Spill -; ARMV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; ARMV7-NEXT: str r6, [sp, #28] @ 4-byte Spill -; ARMV7-NEXT: mov r6, #0 -; ARMV7-NEXT: str r2, [r9] -; ARMV7-NEXT: umlal r5, r6, r3, r8 -; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; ARMV7-NEXT: ldr r4, [sp] @ 4-byte Reload -; ARMV7-NEXT: add r4, r4, r2 -; ARMV7-NEXT: adds r2, r10, r4 -; ARMV7-NEXT: str r2, [sp, #20] @ 4-byte Spill -; ARMV7-NEXT: mov r2, #0 -; ARMV7-NEXT: adc r2, r2, #0 -; ARMV7-NEXT: cmp r12, #0 -; ARMV7-NEXT: str r2, [sp, #32] @ 4-byte Spill -; ARMV7-NEXT: movwne r12, #1 +; ARMV7-NEXT: sub sp, sp, #12 +; ARMV7-NEXT: ldr r7, [sp, #52] +; ARMV7-NEXT: ldr r10, [sp, #48] +; ARMV7-NEXT: ldr r4, [sp, #68] +; ARMV7-NEXT: ldr r9, [sp, #64] +; ARMV7-NEXT: orrs r1, r10, 
r7 +; ARMV7-NEXT: ldr r12, [sp, #60] +; ARMV7-NEXT: ldr lr, [sp, #56] +; ARMV7-NEXT: beq .LBB0_3 +; ARMV7-NEXT: @ %bb.1: @ %overflow.lhs +; ARMV7-NEXT: orr r5, r9, r4 +; ARMV7-NEXT: cmp r5, #0 +; ARMV7-NEXT: beq .LBB0_5 +; ARMV7-NEXT: @ %bb.2: @ %overflow +; ARMV7-NEXT: movwne r5, #1 ; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: ldr r2, [sp, #96] +; ARMV7-NEXT: mov r6, r12 ; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: orrs r10, r7, r0 -; ARMV7-NEXT: movwne r10, #1 -; ARMV7-NEXT: orrs r7, r2, lr -; ARMV7-NEXT: ldr r2, [sp, #92] +; ARMV7-NEXT: and r12, r1, r5 +; ARMV7-NEXT: cmp r6, #0 +; ARMV7-NEXT: mov r1, r6 +; ARMV7-NEXT: mov r8, r6 +; ARMV7-NEXT: umull r6, r5, r7, lr +; ARMV7-NEXT: movwne r1, #1 +; ARMV7-NEXT: cmp r7, #0 ; ARMV7-NEXT: movwne r7, #1 -; ARMV7-NEXT: cmp r0, #0 -; ARMV7-NEXT: movwne r0, #1 -; ARMV7-NEXT: cmp r2, #0 -; ARMV7-NEXT: mov r4, r2 -; ARMV7-NEXT: mov r8, r2 -; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: and r1, r7, r1 +; ARMV7-NEXT: mov r11, #0 +; ARMV7-NEXT: cmp r5, #0 +; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: orr r1, r1, r5 +; ARMV7-NEXT: umull r5, r7, r8, r10 +; ARMV7-NEXT: cmp r7, #0 +; ARMV7-NEXT: movwne r7, #1 +; ARMV7-NEXT: orr r7, r1, r7 +; ARMV7-NEXT: add r1, r6, r5 +; ARMV7-NEXT: umull r8, r6, r10, lr +; ARMV7-NEXT: adds r10, r6, r1 +; ARMV7-NEXT: umull r6, r1, r4, r2 +; ARMV7-NEXT: adc r5, r11, #0 +; ARMV7-NEXT: orr r5, r7, r5 +; ARMV7-NEXT: orr r7, r12, r5 +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: mov r5, r3 +; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: cmp r4, #0 ; ARMV7-NEXT: movwne r4, #1 -; ARMV7-NEXT: and r0, r0, r4 -; ARMV7-NEXT: mov r4, #0 -; ARMV7-NEXT: adds r5, r2, r5 -; ARMV7-NEXT: str r5, [r9, #4] -; ARMV7-NEXT: orr r0, r0, r1 -; ARMV7-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; ARMV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; ARMV7-NEXT: and r5, r10, r7 -; ARMV7-NEXT: orr r0, r0, r12 -; ARMV7-NEXT: mov r12, #0 -; ARMV7-NEXT: add r1, r2, r1 -; ARMV7-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; ARMV7-NEXT: adcs r2, r6, r2 -; ARMV7-NEXT: ldr r6, [sp, #28] @ 4-byte Reload -; ARMV7-NEXT: adc r7, r4, #0 -; ARMV7-NEXT: adds r1, r6, r1 -; ARMV7-NEXT: umlal r2, r7, r3, r8 -; ARMV7-NEXT: adc r4, r4, #0 -; ARMV7-NEXT: orr r0, r0, r4 -; ARMV7-NEXT: orr r0, r5, r0 -; ARMV7-NEXT: ldr r4, [sp, #40] @ 4-byte Reload -; ARMV7-NEXT: ldr r5, [sp, #36] @ 4-byte Reload -; ARMV7-NEXT: adds r5, r5, r4 -; ARMV7-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; ARMV7-NEXT: adc r1, r1, r4 -; ARMV7-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: cmp r1, #0 +; ARMV7-NEXT: and r5, r4, r5 +; ARMV7-NEXT: movwne r1, #1 +; ARMV7-NEXT: orr r1, r5, r1 +; ARMV7-NEXT: umull r5, r4, r3, r9 ; ARMV7-NEXT: cmp r4, #0 +; ARMV7-NEXT: add r6, r6, r5 ; ARMV7-NEXT: movwne r4, #1 -; ARMV7-NEXT: cmp r3, #0 -; ARMV7-NEXT: movwne r3, #1 -; ARMV7-NEXT: cmp lr, #0 -; ARMV7-NEXT: movwne lr, #1 -; ARMV7-NEXT: cmp r11, #0 -; ARMV7-NEXT: movwne r11, #1 -; ARMV7-NEXT: adds r2, r2, r5 -; ARMV7-NEXT: and r3, lr, r3 -; ARMV7-NEXT: str r2, [r9, #8] -; ARMV7-NEXT: adcs r1, r7, r1 -; ARMV7-NEXT: str r1, [r9, #12] -; ARMV7-NEXT: orr r1, r3, r11 -; ARMV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload ; ARMV7-NEXT: orr r1, r1, r4 -; ARMV7-NEXT: orr r1, r1, r2 -; ARMV7-NEXT: orr r0, r0, r1 -; ARMV7-NEXT: adc r1, r12, #0 -; ARMV7-NEXT: orr r0, r0, r1 -; ARMV7-NEXT: and r0, r0, #1 -; ARMV7-NEXT: strb r0, [r9, #16] -; ARMV7-NEXT: add sp, sp, #44 +; ARMV7-NEXT: umull r5, r4, r9, r2 +; ARMV7-NEXT: adds r6, r4, r6 +; ARMV7-NEXT: adc r4, r11, #0 +; ARMV7-NEXT: orr r1, r1, r4 +; ARMV7-NEXT: mov r4, #0 +; ARMV7-NEXT: orr 
r12, r7, r1 +; ARMV7-NEXT: adds r7, r8, r5 +; ARMV7-NEXT: umull r8, r5, r2, lr +; ARMV7-NEXT: adc r6, r10, r6 +; ARMV7-NEXT: umlal r5, r4, r3, lr +; ARMV7-NEXT: ldr lr, [sp, #60] +; ARMV7-NEXT: umull r2, r1, r2, lr +; ARMV7-NEXT: adds r5, r2, r5 +; ARMV7-NEXT: adcs r1, r4, r1 +; ARMV7-NEXT: adc r4, r11, #0 +; ARMV7-NEXT: umlal r1, r4, r3, lr +; ARMV7-NEXT: adds r2, r1, r7 +; ARMV7-NEXT: adcs r3, r4, r6 +; ARMV7-NEXT: adc r1, r11, #0 +; ARMV7-NEXT: orr r1, r12, r1 +; ARMV7-NEXT: b .LBB0_8 +; ARMV7-NEXT: .LBB0_3: @ %overflow.no.lhs +; ARMV7-NEXT: orrs r1, r9, r4 +; ARMV7-NEXT: beq .LBB0_7 +; ARMV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; ARMV7-NEXT: umull r1, r5, r2, r9 +; ARMV7-NEXT: mov r6, #0 +; ARMV7-NEXT: mov r11, #0 +; ARMV7-NEXT: umlal r5, r6, r3, r9 +; ARMV7-NEXT: str r1, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: umull r1, r8, r2, r4 +; ARMV7-NEXT: adds r1, r1, r5 +; ARMV7-NEXT: str r1, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: adcs r5, r6, r8 +; ARMV7-NEXT: adc r6, r11, #0 +; ARMV7-NEXT: umull r8, r11, r9, r10 +; ARMV7-NEXT: mla r1, r9, r7, r11 +; ARMV7-NEXT: umlal r5, r6, r3, r4 +; ARMV7-NEXT: mla r1, r4, r10, r1 +; ARMV7-NEXT: adds r4, r5, r8 +; ARMV7-NEXT: umull r8, r5, r2, lr +; ARMV7-NEXT: adc r9, r6, r1 +; ARMV7-NEXT: mov r6, #0 +; ARMV7-NEXT: umlal r5, r6, r3, lr +; ARMV7-NEXT: umull r2, r1, r2, r12 +; ARMV7-NEXT: adds r5, r2, r5 +; ARMV7-NEXT: mov r2, #0 +; ARMV7-NEXT: adcs r1, r6, r1 +; ARMV7-NEXT: adc r2, r2, #0 +; ARMV7-NEXT: umlal r1, r2, r3, r12 +; ARMV7-NEXT: umull r3, r6, lr, r10 +; ARMV7-NEXT: mla r7, lr, r7, r6 +; ARMV7-NEXT: adds r1, r1, r3 +; ARMV7-NEXT: mla r7, r12, r10, r7 +; ARMV7-NEXT: adc r3, r2, r7 +; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: adds r2, r1, r2 +; ARMV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: adcs r3, r3, r1 +; ARMV7-NEXT: adcs r1, r4, #0 +; ARMV7-NEXT: adc r7, r9, #0 +; ARMV7-NEXT: b .LBB0_6 +; ARMV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; ARMV7-NEXT: umull r1, r5, lr, r10 +; ARMV7-NEXT: mov r11, #0 +; ARMV7-NEXT: umull r6, r8, lr, r7 +; ARMV7-NEXT: str r1, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: mov r1, #0 +; ARMV7-NEXT: umlal r5, r1, r12, r10 +; ARMV7-NEXT: adds r5, r6, r5 +; ARMV7-NEXT: str r5, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: adcs r1, r1, r8 +; ARMV7-NEXT: adc r5, r11, #0 +; ARMV7-NEXT: umull r8, r11, r10, r9 +; ARMV7-NEXT: mla r6, r10, r4, r11 +; ARMV7-NEXT: umlal r1, r5, r12, r7 +; ARMV7-NEXT: mla r6, r7, r9, r6 +; ARMV7-NEXT: mov r7, #0 +; ARMV7-NEXT: adds r10, r1, r8 +; ARMV7-NEXT: adc r11, r5, r6 +; ARMV7-NEXT: umull r8, r5, lr, r2 +; ARMV7-NEXT: umlal r5, r7, r12, r2 +; ARMV7-NEXT: umull r1, r6, lr, r3 +; ARMV7-NEXT: adds r5, r1, r5 +; ARMV7-NEXT: adcs r1, r7, r6 +; ARMV7-NEXT: mov r7, #0 +; ARMV7-NEXT: adc r7, r7, #0 +; ARMV7-NEXT: umlal r1, r7, r12, r3 +; ARMV7-NEXT: umull r12, r6, r2, r9 +; ARMV7-NEXT: mla r2, r2, r4, r6 +; ARMV7-NEXT: adds r1, r1, r12 +; ARMV7-NEXT: mla r2, r3, r9, r2 +; ARMV7-NEXT: adc r3, r7, r2 +; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: adds r2, r1, r2 +; ARMV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: adcs r3, r3, r1 +; ARMV7-NEXT: adcs r1, r10, #0 +; ARMV7-NEXT: adc r7, r11, #0 +; ARMV7-NEXT: .LBB0_6: @ %overflow.res +; ARMV7-NEXT: orrs r1, r1, r7 +; ARMV7-NEXT: movwne r1, #1 +; ARMV7-NEXT: b .LBB0_8 +; ARMV7-NEXT: .LBB0_7: @ %overflow.no +; ARMV7-NEXT: umull r1, r11, r2, lr +; ARMV7-NEXT: mov r6, #0 +; ARMV7-NEXT: umlal r11, r6, r3, lr +; ARMV7-NEXT: str r1, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: mov r1, #0 +; ARMV7-NEXT: umull r5, r8, r2, 
r12 +; ARMV7-NEXT: adds r5, r5, r11 +; ARMV7-NEXT: adcs r6, r6, r8 +; ARMV7-NEXT: adc r11, r1, #0 +; ARMV7-NEXT: umlal r6, r11, r3, r12 +; ARMV7-NEXT: umull r8, r12, lr, r10 +; ARMV7-NEXT: str r6, [sp] @ 4-byte Spill +; ARMV7-NEXT: ldr r6, [sp, #60] +; ARMV7-NEXT: mla r7, lr, r7, r12 +; ARMV7-NEXT: str r8, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: ldr r8, [sp, #4] @ 4-byte Reload +; ARMV7-NEXT: mla r12, r6, r10, r7 +; ARMV7-NEXT: umull lr, r7, r9, r2 +; ARMV7-NEXT: mla r3, r9, r3, r7 +; ARMV7-NEXT: mla r2, r4, r2, r3 +; ARMV7-NEXT: ldr r3, [sp, #8] @ 4-byte Reload +; ARMV7-NEXT: adds r3, lr, r3 +; ARMV7-NEXT: adc r7, r2, r12 +; ARMV7-NEXT: ldr r2, [sp] @ 4-byte Reload +; ARMV7-NEXT: adds r2, r2, r3 +; ARMV7-NEXT: adc r3, r11, r7 +; ARMV7-NEXT: .LBB0_8: @ %overflow.res +; ARMV7-NEXT: str r8, [r0] +; ARMV7-NEXT: and r1, r1, #1 +; ARMV7-NEXT: str r5, [r0, #4] +; ARMV7-NEXT: str r2, [r0, #8] +; ARMV7-NEXT: str r3, [r0, #12] +; ARMV7-NEXT: strb r1, [r0, #16] +; ARMV7-NEXT: add sp, sp, #12 ; ARMV7-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll index 64d9831442970..91ea1a1ad75e9 100644 --- a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll @@ -4,12 +4,18 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; ARMV6-LABEL: mulodi_test: -; ARMV6: @ %bb.0: @ %start +; ARMV6: @ %bb.0: @ %overflow.entry ; ARMV6-NEXT: push {r4, r5, r11, lr} -; ARMV6-NEXT: umull r12, lr, r1, r2 -; ARMV6-NEXT: umull r4, r5, r3, r0 -; ARMV6-NEXT: cmp lr, #0 -; ARMV6-NEXT: movne lr, #1 +; ARMV6-NEXT: cmp r1, #0 +; ARMV6-NEXT: beq .LBB0_3 +; ARMV6-NEXT: @ %bb.1: @ %overflow.lhs +; ARMV6-NEXT: cmp r3, #0 +; ARMV6-NEXT: beq .LBB0_5 +; ARMV6-NEXT: @ %bb.2: @ %overflow +; ARMV6-NEXT: umull r12, r4, r1, r2 +; ARMV6-NEXT: umull lr, r5, r3, r0 +; ARMV6-NEXT: cmp r4, #0 +; ARMV6-NEXT: movne r4, #1 ; ARMV6-NEXT: cmp r3, #0 ; ARMV6-NEXT: movne r3, #1 ; ARMV6-NEXT: cmp r1, #0 @@ -17,38 +23,105 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; ARMV6-NEXT: movne r1, #1 ; ARMV6-NEXT: and r1, r1, r3 ; ARMV6-NEXT: cmp r5, #0 -; ARMV6-NEXT: orr r1, r1, lr +; ARMV6-NEXT: orr r1, r1, r4 ; ARMV6-NEXT: movne r5, #1 ; ARMV6-NEXT: orr r3, r1, r5 -; ARMV6-NEXT: add r1, r12, r4 +; ARMV6-NEXT: add r1, r12, lr ; ARMV6-NEXT: adds r1, r2, r1 ; ARMV6-NEXT: mov r5, #0 ; ARMV6-NEXT: adc r2, r5, #0 -; ARMV6-NEXT: orr r2, r3, r2 +; ARMV6-NEXT: orr r12, r3, r2 +; ARMV6-NEXT: and r2, r12, #1 +; ARMV6-NEXT: pop {r4, r5, r11, pc} +; ARMV6-NEXT: .LBB0_3: @ %overflow.no.lhs +; ARMV6-NEXT: cmp r3, #0 +; ARMV6-NEXT: beq .LBB0_7 +; ARMV6-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; ARMV6-NEXT: mov lr, r0 +; ARMV6-NEXT: umull r0, r4, r0, r2 +; ARMV6-NEXT: mov r12, r1 +; ARMV6-NEXT: mla r1, r1, r2, r4 +; ARMV6-NEXT: mul r12, r12, r3 +; ARMV6-NEXT: umlal r1, r12, lr, r3 +; ARMV6-NEXT: b .LBB0_6 +; ARMV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; ARMV6-NEXT: mov r12, r0 +; ARMV6-NEXT: umull r0, lr, r2, r0 +; ARMV6-NEXT: mov r4, r1 +; ARMV6-NEXT: mla r1, r3, r12, lr +; ARMV6-NEXT: mul r12, r3, r4 +; ARMV6-NEXT: umlal r1, r12, r2, r4 +; ARMV6-NEXT: .LBB0_6: @ %overflow.res +; ARMV6-NEXT: cmp r12, #0 +; ARMV6-NEXT: movne r12, #1 +; ARMV6-NEXT: and r2, r12, #1 +; ARMV6-NEXT: pop {r4, r5, r11, pc} +; ARMV6-NEXT: .LBB0_7: @ %overflow.no +; ARMV6-NEXT: mov r12, r0 +; 
ARMV6-NEXT: umull r0, r4, r0, r2 +; ARMV6-NEXT: mla r3, r12, r3, r4 +; ARMV6-NEXT: mov r12, #0 +; ARMV6-NEXT: mla r1, r1, r2, r3 +; ARMV6-NEXT: and r2, r12, #1 ; ARMV6-NEXT: pop {r4, r5, r11, pc} ; ; ARMV7-LABEL: mulodi_test: -; ARMV7: @ %bb.0: @ %start +; ARMV7: @ %bb.0: @ %overflow.entry ; ARMV7-NEXT: push {r4, r5, r11, lr} -; ARMV7-NEXT: umull r12, lr, r3, r0 +; ARMV7-NEXT: cmp r1, #0 +; ARMV7-NEXT: beq .LBB0_3 +; ARMV7-NEXT: @ %bb.1: @ %overflow.lhs +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: beq .LBB0_5 +; ARMV7-NEXT: @ %bb.2: @ %overflow +; ARMV7-NEXT: umull lr, r4, r3, r0 ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 ; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: umull r0, r4, r0, r2 +; ARMV7-NEXT: umull r0, r12, r0, r2 ; ARMV7-NEXT: umull r2, r5, r1, r2 ; ARMV7-NEXT: movwne r1, #1 ; ARMV7-NEXT: and r1, r1, r3 ; ARMV7-NEXT: cmp r5, #0 ; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: cmp lr, #0 +; ARMV7-NEXT: cmp r4, #0 ; ARMV7-NEXT: orr r1, r1, r5 -; ARMV7-NEXT: movwne lr, #1 -; ARMV7-NEXT: orr r3, r1, lr -; ARMV7-NEXT: add r1, r2, r12 +; ARMV7-NEXT: movwne r4, #1 +; ARMV7-NEXT: orr r3, r1, r4 +; ARMV7-NEXT: add r1, r2, lr ; ARMV7-NEXT: mov r2, #0 -; ARMV7-NEXT: adds r1, r4, r1 +; ARMV7-NEXT: adds r1, r12, r1 ; ARMV7-NEXT: adc r2, r2, #0 -; ARMV7-NEXT: orr r2, r3, r2 +; ARMV7-NEXT: orr r12, r3, r2 +; ARMV7-NEXT: and r2, r12, #1 +; ARMV7-NEXT: pop {r4, r5, r11, pc} +; ARMV7-NEXT: .LBB0_3: @ %overflow.no.lhs +; ARMV7-NEXT: mov r5, r0 +; ARMV7-NEXT: umull r0, r4, r0, r2 +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: beq .LBB0_7 +; ARMV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; ARMV7-NEXT: mul r12, r1, r3 +; ARMV7-NEXT: mla r1, r1, r2, r4 +; ARMV7-NEXT: umlal r1, r12, r5, r3 +; ARMV7-NEXT: b .LBB0_6 +; ARMV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; ARMV7-NEXT: mov lr, r0 +; ARMV7-NEXT: umull r0, r4, r2, r0 +; ARMV7-NEXT: mov r5, r1 +; ARMV7-NEXT: mul r12, r3, r1 +; ARMV7-NEXT: mla r1, r3, lr, r4 +; ARMV7-NEXT: umlal r1, r12, r2, r5 +; ARMV7-NEXT: .LBB0_6: @ %overflow.res +; ARMV7-NEXT: cmp r12, #0 +; ARMV7-NEXT: movwne r12, #1 +; ARMV7-NEXT: and r2, r12, #1 +; ARMV7-NEXT: pop {r4, r5, r11, pc} +; ARMV7-NEXT: .LBB0_7: @ %overflow.no +; ARMV7-NEXT: mla r3, r5, r3, r4 +; ARMV7-NEXT: mov r12, #0 +; ARMV7-NEXT: mla r1, r1, r2, r3 +; ARMV7-NEXT: and r2, r12, #1 ; ARMV7-NEXT: pop {r4, r5, r11, pc} start: %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2 diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll index 968c06136225d..5498a0741bc23 100644 --- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll @@ -4,7 +4,13 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; LA32-LABEL: smuloi64: -; LA32: # %bb.0: +; LA32: # %bb.0: # %overflow.entry +; LA32-NEXT: srai.w $a6, $a0, 31 +; LA32-NEXT: srai.w $a5, $a2, 31 +; LA32-NEXT: beq $a1, $a6, .LBB0_3 +; LA32-NEXT: # %bb.1: # %overflow.lhs +; LA32-NEXT: beq $a3, $a5, .LBB0_6 +; LA32-NEXT: # %bb.2: # %overflow ; LA32-NEXT: mulh.wu $a5, $a0, $a2 ; LA32-NEXT: mul.w $a6, $a1, $a2 ; LA32-NEXT: add.w $a5, $a6, $a5 @@ -38,11 +44,138 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; LA32-NEXT: xor $a1, $a1, $a6 ; LA32-NEXT: xor $a3, $a3, $a6 ; LA32-NEXT: or $a1, $a3, $a1 -; LA32-NEXT: sltu $a1, $zero, $a1 +; LA32-NEXT: sltu $a6, $zero, $a1 +; LA32-NEXT: b .LBB0_9 +; LA32-NEXT: .LBB0_3: # %overflow.no.lhs +; LA32-NEXT: beq $a3, $a5, .LBB0_8 +; LA32-NEXT: # %bb.4: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a1, 
.LBB0_10 +; LA32-NEXT: # %bb.5: # %overflow.no.lhs.only +; LA32-NEXT: move $a5, $a0 +; LA32-NEXT: move $a6, $a1 +; LA32-NEXT: bgez $a1, .LBB0_11 +; LA32-NEXT: b .LBB0_12 +; LA32-NEXT: .LBB0_6: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a3, .LBB0_14 +; LA32-NEXT: # %bb.7: # %overflow.no.rhs.only +; LA32-NEXT: move $a5, $a2 +; LA32-NEXT: move $a6, $a3 +; LA32-NEXT: bgez $a3, .LBB0_15 +; LA32-NEXT: b .LBB0_16 +; LA32-NEXT: .LBB0_8: # %overflow.no +; LA32-NEXT: move $a6, $zero +; LA32-NEXT: mulh.wu $a5, $a0, $a2 +; LA32-NEXT: mul.w $a3, $a0, $a3 +; LA32-NEXT: add.w $a3, $a5, $a3 +; LA32-NEXT: mul.w $a1, $a1, $a2 +; LA32-NEXT: add.w $a5, $a3, $a1 +; LA32-NEXT: .LBB0_9: # %overflow.res ; LA32-NEXT: mul.w $a0, $a0, $a2 +; LA32-NEXT: b .LBB0_27 +; LA32-NEXT: .LBB0_10: +; LA32-NEXT: sub.w $a5, $zero, $a0 +; LA32-NEXT: sltu $a6, $zero, $a0 +; LA32-NEXT: add.w $a6, $a1, $a6 +; LA32-NEXT: sub.w $a6, $zero, $a6 +; LA32-NEXT: bltz $a1, .LBB0_12 +; LA32-NEXT: .LBB0_11: # %overflow.no.lhs.only +; LA32-NEXT: move $a6, $a1 +; LA32-NEXT: move $a5, $a0 +; LA32-NEXT: .LBB0_12: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a3, .LBB0_18 +; LA32-NEXT: # %bb.13: # %overflow.no.lhs.only +; LA32-NEXT: move $a7, $a2 +; LA32-NEXT: move $a0, $a3 +; LA32-NEXT: b .LBB0_19 +; LA32-NEXT: .LBB0_14: +; LA32-NEXT: sub.w $a5, $zero, $a2 +; LA32-NEXT: sltu $a6, $zero, $a2 +; LA32-NEXT: add.w $a6, $a3, $a6 +; LA32-NEXT: sub.w $a6, $zero, $a6 +; LA32-NEXT: bltz $a3, .LBB0_16 +; LA32-NEXT: .LBB0_15: # %overflow.no.rhs.only +; LA32-NEXT: move $a6, $a3 +; LA32-NEXT: move $a5, $a2 +; LA32-NEXT: .LBB0_16: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a1, .LBB0_22 +; LA32-NEXT: # %bb.17: # %overflow.no.rhs.only +; LA32-NEXT: move $a7, $a0 +; LA32-NEXT: move $a2, $a1 +; LA32-NEXT: b .LBB0_23 +; LA32-NEXT: .LBB0_18: +; LA32-NEXT: sub.w $a7, $zero, $a2 +; LA32-NEXT: sltu $a0, $zero, $a2 +; LA32-NEXT: add.w $a0, $a3, $a0 +; LA32-NEXT: sub.w $a0, $zero, $a0 +; LA32-NEXT: .LBB0_19: # %overflow.no.lhs.only +; LA32-NEXT: slti $a1, $a1, 0 +; LA32-NEXT: slti $t0, $a3, 0 +; LA32-NEXT: bltz $a3, .LBB0_21 +; LA32-NEXT: # %bb.20: # %overflow.no.lhs.only +; LA32-NEXT: move $a0, $a3 +; LA32-NEXT: move $a7, $a2 +; LA32-NEXT: .LBB0_21: # %overflow.no.lhs.only +; LA32-NEXT: mulh.wu $a2, $a5, $a7 +; LA32-NEXT: mul.w $a3, $a6, $a7 +; LA32-NEXT: add.w $a2, $a2, $a3 +; LA32-NEXT: mul.w $a3, $a5, $a7 +; LA32-NEXT: mul.w $a6, $a6, $a0 +; LA32-NEXT: mulh.wu $a7, $a5, $a0 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: mul.w $a0, $a5, $a0 +; LA32-NEXT: add.w $a5, $a2, $a0 +; LA32-NEXT: sltu $a0, $a5, $a2 +; LA32-NEXT: add.w $a2, $a6, $a0 +; LA32-NEXT: xor $a1, $t0, $a1 +; LA32-NEXT: sub.w $a6, $zero, $a1 +; LA32-NEXT: xor $a0, $a3, $a6 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: sltu $a1, $a0, $a1 +; LA32-NEXT: xor $a3, $a5, $a6 +; LA32-NEXT: add.w $a5, $a3, $a1 +; LA32-NEXT: sltu $a1, $a5, $a1 +; LA32-NEXT: xor $a2, $a2, $a6 +; LA32-NEXT: b .LBB0_26 +; LA32-NEXT: .LBB0_22: +; LA32-NEXT: sub.w $a7, $zero, $a0 +; LA32-NEXT: sltu $a2, $zero, $a0 +; LA32-NEXT: add.w $a2, $a1, $a2 +; LA32-NEXT: sub.w $a2, $zero, $a2 +; LA32-NEXT: .LBB0_23: # %overflow.no.rhs.only +; LA32-NEXT: slti $a3, $a3, 0 +; LA32-NEXT: slti $t0, $a1, 0 +; LA32-NEXT: bltz $a1, .LBB0_25 +; LA32-NEXT: # %bb.24: # %overflow.no.rhs.only +; LA32-NEXT: move $a2, $a1 +; LA32-NEXT: move $a7, $a0 +; LA32-NEXT: .LBB0_25: # %overflow.no.rhs.only +; LA32-NEXT: mulh.wu $a0, $a5, $a7 +; LA32-NEXT: mul.w $a1, $a6, $a7 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: mul.w $a1, $a5, $a7 +; LA32-NEXT: mul.w $a6, 
$a6, $a2 +; LA32-NEXT: mulh.wu $a7, $a5, $a2 +; LA32-NEXT: add.w $a6, $a7, $a6 +; LA32-NEXT: mul.w $a2, $a5, $a2 +; LA32-NEXT: add.w $a2, $a0, $a2 +; LA32-NEXT: sltu $a0, $a2, $a0 +; LA32-NEXT: add.w $a6, $a6, $a0 +; LA32-NEXT: xor $a3, $a3, $t0 +; LA32-NEXT: sub.w $a7, $zero, $a3 +; LA32-NEXT: xor $a0, $a1, $a7 +; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: sltu $a1, $a0, $a3 +; LA32-NEXT: xor $a2, $a2, $a7 +; LA32-NEXT: add.w $a5, $a2, $a1 +; LA32-NEXT: sltu $a1, $a5, $a1 +; LA32-NEXT: xor $a2, $a6, $a7 +; LA32-NEXT: .LBB0_26: # %overflow.res +; LA32-NEXT: add.w $a1, $a2, $a1 +; LA32-NEXT: sltu $a6, $zero, $a1 +; LA32-NEXT: .LBB0_27: # %overflow.res ; LA32-NEXT: st.w $a0, $a4, 0 +; LA32-NEXT: andi $a0, $a6, 1 ; LA32-NEXT: st.w $a5, $a4, 4 -; LA32-NEXT: move $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: smuloi64: @@ -63,7 +196,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-LABEL: smuloi128: -; LA32: # %bb.0: +; LA32: # %bb.0: # %overflow.entry ; LA32-NEXT: addi.w $sp, $sp, -48 ; LA32-NEXT: .cfi_def_cfa_offset 48 ; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill @@ -88,198 +221,608 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-NEXT: .cfi_offset 29, -36 ; LA32-NEXT: .cfi_offset 30, -40 ; LA32-NEXT: .cfi_offset 31, -44 -; LA32-NEXT: ld.w $a5, $a1, 12 -; LA32-NEXT: ld.w $a6, $a1, 8 -; LA32-NEXT: ld.w $t1, $a0, 4 -; LA32-NEXT: ld.w $a3, $a1, 0 -; LA32-NEXT: ld.w $a7, $a0, 8 -; LA32-NEXT: ld.w $t0, $a0, 12 -; LA32-NEXT: ld.w $a4, $a0, 0 -; LA32-NEXT: ld.w $t4, $a1, 4 -; LA32-NEXT: mulh.wu $a0, $a7, $a3 -; LA32-NEXT: mul.w $a1, $t0, $a3 -; LA32-NEXT: add.w $a0, $a1, $a0 -; LA32-NEXT: sltu $a1, $a0, $a1 -; LA32-NEXT: mulh.wu $t2, $t0, $a3 -; LA32-NEXT: add.w $a1, $t2, $a1 -; LA32-NEXT: mul.w $t3, $a7, $t4 -; LA32-NEXT: add.w $t2, $t3, $a0 -; LA32-NEXT: sltu $a0, $t2, $t3 -; LA32-NEXT: mulh.wu $t3, $a7, $t4 -; LA32-NEXT: add.w $a0, $t3, $a0 -; LA32-NEXT: add.w $t5, $a1, $a0 -; LA32-NEXT: mul.w $t6, $t0, $t4 -; LA32-NEXT: add.w $t7, $t6, $t5 -; LA32-NEXT: srai.w $a0, $t0, 31 -; LA32-NEXT: mul.w $t8, $a3, $a0 -; LA32-NEXT: add.w $t3, $t7, $t8 -; LA32-NEXT: sltu $fp, $t3, $t7 +; LA32-NEXT: ld.w $a3, $a1, 12 +; LA32-NEXT: ld.w $a7, $a1, 8 +; LA32-NEXT: ld.w $a5, $a1, 0 +; LA32-NEXT: ld.w $a6, $a0, 0 +; LA32-NEXT: ld.w $t0, $a0, 4 +; LA32-NEXT: ld.w $a4, $a0, 12 +; LA32-NEXT: ld.w $a0, $a0, 8 +; LA32-NEXT: ld.w $a1, $a1, 4 +; LA32-NEXT: srai.w $t1, $t0, 31 +; LA32-NEXT: xor $t2, $a4, $t1 +; LA32-NEXT: xor $t1, $a0, $t1 +; LA32-NEXT: or $t2, $t1, $t2 +; LA32-NEXT: srai.w $t1, $a1, 31 +; LA32-NEXT: beq $t2, $zero, .LBB1_11 +; LA32-NEXT: # %bb.1: # %overflow.lhs +; LA32-NEXT: xor $t2, $a7, $t1 +; LA32-NEXT: xor $t1, $a3, $t1 +; LA32-NEXT: or $t1, $t2, $t1 +; LA32-NEXT: beq $t1, $zero, .LBB1_14 +; LA32-NEXT: # %bb.2: # %overflow +; LA32-NEXT: mulh.wu $t1, $a0, $a5 +; LA32-NEXT: mul.w $t2, $a4, $a5 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: sltu $t2, $t1, $t2 +; LA32-NEXT: mulh.wu $t3, $a4, $a5 +; LA32-NEXT: add.w $t5, $t3, $t2 +; LA32-NEXT: mul.w $t3, $a0, $a1 +; LA32-NEXT: add.w $t2, $t3, $t1 +; LA32-NEXT: sltu $t1, $t2, $t3 +; LA32-NEXT: mulh.wu $t3, $a0, $a1 +; LA32-NEXT: add.w $t1, $t3, $t1 +; LA32-NEXT: add.w $t1, $t5, $t1 +; LA32-NEXT: mul.w $t6, $a4, $a1 +; LA32-NEXT: add.w $t7, $t6, $t1 +; LA32-NEXT: srai.w $t3, $a4, 31 +; LA32-NEXT: mul.w $t8, $a5, $t3 +; LA32-NEXT: add.w $t4, $t7, $t8 +; LA32-NEXT: sltu $fp, $t4, $t7 ; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: sltu $a1, $t5, $a1 -; 
LA32-NEXT: mulh.wu $t5, $t0, $t4 -; LA32-NEXT: add.w $a1, $t5, $a1 -; LA32-NEXT: add.w $a1, $a1, $t6 -; LA32-NEXT: mulh.wu $t5, $a3, $a0 +; LA32-NEXT: sltu $t1, $t1, $t5 +; LA32-NEXT: mulh.wu $t5, $a4, $a1 +; LA32-NEXT: add.w $t1, $t5, $t1 +; LA32-NEXT: add.w $t1, $t1, $t6 +; LA32-NEXT: mulh.wu $t5, $a5, $t3 ; LA32-NEXT: add.w $t5, $t5, $t8 -; LA32-NEXT: mul.w $t6, $t4, $a0 +; LA32-NEXT: mul.w $t6, $a1, $t3 ; LA32-NEXT: add.w $t5, $t5, $t6 -; LA32-NEXT: add.w $t8, $a1, $t5 -; LA32-NEXT: mulh.wu $a1, $a4, $a3 -; LA32-NEXT: mul.w $t5, $t1, $a3 -; LA32-NEXT: add.w $a1, $t5, $a1 -; LA32-NEXT: sltu $t5, $a1, $t5 -; LA32-NEXT: mulh.wu $t6, $t1, $a3 -; LA32-NEXT: add.w $t5, $t6, $t5 -; LA32-NEXT: mul.w $t6, $a4, $t4 -; LA32-NEXT: add.w $a1, $t6, $a1 -; LA32-NEXT: sltu $t6, $a1, $t6 -; LA32-NEXT: mulh.wu $t7, $a4, $t4 -; LA32-NEXT: add.w $t6, $t7, $t6 -; LA32-NEXT: add.w $t6, $t5, $t6 -; LA32-NEXT: mul.w $t7, $t1, $t4 -; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: add.w $t5, $t1, $t5 +; LA32-NEXT: mulh.wu $t1, $a6, $a5 +; LA32-NEXT: mul.w $t6, $t0, $a5 +; LA32-NEXT: add.w $t1, $t6, $t1 +; LA32-NEXT: sltu $t6, $t1, $t6 +; LA32-NEXT: mulh.wu $t7, $t0, $a5 ; LA32-NEXT: add.w $t6, $t7, $t6 -; LA32-NEXT: sltu $t7, $t6, $t7 -; LA32-NEXT: mulh.wu $t4, $t1, $t4 -; LA32-NEXT: add.w $t4, $t4, $t5 -; LA32-NEXT: add.w $t4, $t4, $t7 -; LA32-NEXT: add.w $t4, $t2, $t4 -; LA32-NEXT: mul.w $t5, $a7, $a3 -; LA32-NEXT: add.w $t6, $t5, $t6 -; LA32-NEXT: sltu $t5, $t6, $t5 -; LA32-NEXT: add.w $t7, $t4, $t5 -; LA32-NEXT: add.w $t4, $t8, $fp -; LA32-NEXT: beq $t7, $t2, .LBB1_2 -; LA32-NEXT: # %bb.1: -; LA32-NEXT: sltu $t5, $t7, $t2 -; LA32-NEXT: .LBB1_2: -; LA32-NEXT: add.w $t5, $t3, $t5 -; LA32-NEXT: sltu $t2, $t5, $t3 -; LA32-NEXT: add.w $t4, $t4, $t2 -; LA32-NEXT: mulh.wu $t2, $a4, $a6 -; LA32-NEXT: mul.w $t3, $t1, $a6 -; LA32-NEXT: add.w $t2, $t3, $t2 -; LA32-NEXT: sltu $t3, $t2, $t3 -; LA32-NEXT: mulh.wu $t8, $t1, $a6 -; LA32-NEXT: add.w $s0, $t8, $t3 -; LA32-NEXT: mul.w $t3, $a4, $a5 -; LA32-NEXT: add.w $t8, $t3, $t2 -; LA32-NEXT: sltu $t2, $t8, $t3 -; LA32-NEXT: mulh.wu $t3, $a4, $a5 -; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: mul.w $t7, $a6, $a1 +; LA32-NEXT: add.w $t1, $t7, $t1 +; LA32-NEXT: sltu $t7, $t1, $t7 +; LA32-NEXT: mulh.wu $t8, $a6, $a1 +; LA32-NEXT: add.w $t7, $t8, $t7 +; LA32-NEXT: add.w $t7, $t6, $t7 +; LA32-NEXT: mul.w $t8, $t0, $a1 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: add.w $t7, $t8, $t7 +; LA32-NEXT: sltu $t8, $t7, $t8 +; LA32-NEXT: mulh.wu $a1, $t0, $a1 +; LA32-NEXT: add.w $a1, $a1, $t6 +; LA32-NEXT: add.w $a1, $a1, $t8 +; LA32-NEXT: add.w $t8, $t2, $a1 +; LA32-NEXT: mul.w $t6, $a0, $a5 +; LA32-NEXT: add.w $a1, $t6, $t7 +; LA32-NEXT: sltu $t6, $a1, $t6 +; LA32-NEXT: add.w $t7, $t8, $t6 +; LA32-NEXT: add.w $t5, $t5, $fp +; LA32-NEXT: beq $t7, $t2, .LBB1_4 +; LA32-NEXT: # %bb.3: # %overflow +; LA32-NEXT: sltu $t6, $t7, $t2 +; LA32-NEXT: .LBB1_4: # %overflow +; LA32-NEXT: add.w $t6, $t4, $t6 +; LA32-NEXT: sltu $t2, $t6, $t4 +; LA32-NEXT: add.w $t5, $t5, $t2 +; LA32-NEXT: mulh.wu $t2, $a6, $a7 +; LA32-NEXT: mul.w $t4, $t0, $a7 +; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: sltu $t4, $t2, $t4 +; LA32-NEXT: mulh.wu $t8, $t0, $a7 +; LA32-NEXT: add.w $s0, $t8, $t4 +; LA32-NEXT: mul.w $t4, $a6, $a3 +; LA32-NEXT: add.w $t8, $t4, $t2 +; LA32-NEXT: sltu $t2, $t8, $t4 +; LA32-NEXT: mulh.wu $t4, $a6, $a3 +; LA32-NEXT: add.w $t2, $t4, $t2 ; LA32-NEXT: add.w $t2, $s0, $t2 -; LA32-NEXT: mul.w $s1, $t1, $a5 +; LA32-NEXT: mul.w $s1, $t0, $a3 ; LA32-NEXT: add.w $s2, $s1, $t2 -; LA32-NEXT: srai.w $t3, 
$a5, 31 -; LA32-NEXT: mul.w $s3, $t3, $a4 +; LA32-NEXT: srai.w $t4, $a3, 31 +; LA32-NEXT: mul.w $s3, $t4, $a6 ; LA32-NEXT: add.w $fp, $s2, $s3 ; LA32-NEXT: sltu $s4, $fp, $s2 ; LA32-NEXT: sltu $s1, $s2, $s1 ; LA32-NEXT: sltu $t2, $t2, $s0 -; LA32-NEXT: mulh.wu $s0, $t1, $a5 +; LA32-NEXT: mulh.wu $s0, $t0, $a3 ; LA32-NEXT: add.w $t2, $s0, $t2 ; LA32-NEXT: add.w $t2, $t2, $s1 -; LA32-NEXT: mul.w $t1, $t3, $t1 -; LA32-NEXT: mulh.wu $s0, $t3, $a4 -; LA32-NEXT: add.w $t1, $s0, $t1 -; LA32-NEXT: add.w $t1, $t1, $s3 -; LA32-NEXT: add.w $s0, $t2, $t1 -; LA32-NEXT: add.w $t2, $t8, $t7 -; LA32-NEXT: mul.w $t7, $a4, $a6 -; LA32-NEXT: add.w $t1, $t7, $t6 -; LA32-NEXT: sltu $t7, $t1, $t7 -; LA32-NEXT: add.w $t2, $t2, $t7 -; LA32-NEXT: add.w $t6, $s0, $s4 -; LA32-NEXT: beq $t2, $t8, .LBB1_4 -; LA32-NEXT: # %bb.3: -; LA32-NEXT: sltu $t7, $t2, $t8 -; LA32-NEXT: .LBB1_4: +; LA32-NEXT: mul.w $t0, $t4, $t0 +; LA32-NEXT: mulh.wu $s0, $t4, $a6 +; LA32-NEXT: add.w $t0, $s0, $t0 +; LA32-NEXT: add.w $t0, $t0, $s3 +; LA32-NEXT: add.w $t0, $t2, $t0 +; LA32-NEXT: add.w $s0, $t8, $t7 +; LA32-NEXT: mul.w $t7, $a6, $a7 +; LA32-NEXT: add.w $t2, $t7, $a1 +; LA32-NEXT: sltu $t7, $t2, $t7 +; LA32-NEXT: add.w $a1, $s0, $t7 +; LA32-NEXT: add.w $t0, $t0, $s4 +; LA32-NEXT: beq $a1, $t8, .LBB1_6 +; LA32-NEXT: # %bb.5: # %overflow +; LA32-NEXT: sltu $t7, $a1, $t8 +; LA32-NEXT: .LBB1_6: # %overflow ; LA32-NEXT: add.w $t7, $fp, $t7 ; LA32-NEXT: sltu $t8, $t7, $fp -; LA32-NEXT: add.w $t8, $t6, $t8 -; LA32-NEXT: add.w $t6, $t4, $t8 -; LA32-NEXT: add.w $t7, $t5, $t7 -; LA32-NEXT: sltu $s0, $t7, $t5 -; LA32-NEXT: add.w $s4, $t6, $s0 -; LA32-NEXT: mulh.wu $t5, $a7, $a6 -; LA32-NEXT: mul.w $s1, $t0, $a6 -; LA32-NEXT: add.w $s3, $s1, $t5 -; LA32-NEXT: mul.w $fp, $a7, $a5 +; LA32-NEXT: add.w $t8, $t0, $t8 +; LA32-NEXT: add.w $t0, $t5, $t8 +; LA32-NEXT: add.w $t7, $t6, $t7 +; LA32-NEXT: sltu $s0, $t7, $t6 +; LA32-NEXT: add.w $s4, $t0, $s0 +; LA32-NEXT: mulh.wu $t0, $a0, $a7 +; LA32-NEXT: mul.w $s1, $a4, $a7 +; LA32-NEXT: add.w $s3, $s1, $t0 +; LA32-NEXT: mul.w $fp, $a0, $a3 ; LA32-NEXT: add.w $s2, $fp, $s3 ; LA32-NEXT: add.w $t6, $s2, $s4 -; LA32-NEXT: mul.w $s5, $a7, $a6 -; LA32-NEXT: add.w $t5, $s5, $t7 -; LA32-NEXT: sltu $t7, $t5, $s5 +; LA32-NEXT: mul.w $s5, $a0, $a7 +; LA32-NEXT: add.w $t0, $s5, $t7 +; LA32-NEXT: sltu $t7, $t0, $s5 ; LA32-NEXT: add.w $t6, $t6, $t7 -; LA32-NEXT: beq $t6, $s2, .LBB1_6 -; LA32-NEXT: # %bb.5: +; LA32-NEXT: beq $t6, $s2, .LBB1_8 +; LA32-NEXT: # %bb.7: # %overflow ; LA32-NEXT: sltu $t7, $t6, $s2 -; LA32-NEXT: .LBB1_6: -; LA32-NEXT: beq $s4, $t4, .LBB1_8 -; LA32-NEXT: # %bb.7: -; LA32-NEXT: sltu $s0, $s4, $t4 -; LA32-NEXT: .LBB1_8: -; LA32-NEXT: srai.w $t4, $t4, 31 +; LA32-NEXT: .LBB1_8: # %overflow +; LA32-NEXT: beq $s4, $t5, .LBB1_10 +; LA32-NEXT: # %bb.9: # %overflow +; LA32-NEXT: sltu $s0, $s4, $t5 +; LA32-NEXT: .LBB1_10: # %overflow +; LA32-NEXT: srai.w $t5, $t5, 31 ; LA32-NEXT: srai.w $t8, $t8, 31 -; LA32-NEXT: add.w $t8, $t4, $t8 +; LA32-NEXT: add.w $t8, $t5, $t8 ; LA32-NEXT: add.w $s0, $t8, $s0 ; LA32-NEXT: sltu $s1, $s3, $s1 -; LA32-NEXT: mulh.wu $s3, $t0, $a6 +; LA32-NEXT: mulh.wu $s3, $a4, $a7 ; LA32-NEXT: add.w $s1, $s3, $s1 ; LA32-NEXT: sltu $fp, $s2, $fp -; LA32-NEXT: mulh.wu $s2, $a7, $a5 +; LA32-NEXT: mulh.wu $s2, $a0, $a3 ; LA32-NEXT: add.w $fp, $s2, $fp ; LA32-NEXT: add.w $fp, $s1, $fp -; LA32-NEXT: mul.w $s2, $t0, $a5 +; LA32-NEXT: mul.w $s2, $a4, $a3 ; LA32-NEXT: add.w $s3, $s2, $fp -; LA32-NEXT: mul.w $s4, $a6, $a0 -; LA32-NEXT: mul.w $s5, $t3, $a7 +; LA32-NEXT: mul.w $s4, $a7, $t3 +; 
LA32-NEXT: mul.w $s5, $t4, $a0 ; LA32-NEXT: add.w $s6, $s5, $s4 ; LA32-NEXT: add.w $s7, $s3, $s6 ; LA32-NEXT: add.w $s8, $s7, $s0 ; LA32-NEXT: add.w $t7, $s8, $t7 ; LA32-NEXT: sltu $ra, $t7, $s8 -; LA32-NEXT: sltu $t4, $t8, $t4 -; LA32-NEXT: add.w $t4, $t8, $t4 +; LA32-NEXT: sltu $t5, $t8, $t5 +; LA32-NEXT: add.w $t5, $t8, $t5 ; LA32-NEXT: sltu $t8, $s0, $t8 -; LA32-NEXT: add.w $t4, $t4, $t8 +; LA32-NEXT: add.w $t5, $t5, $t8 ; LA32-NEXT: sltu $t8, $s7, $s3 ; LA32-NEXT: sltu $s0, $s3, $s2 ; LA32-NEXT: sltu $fp, $fp, $s1 -; LA32-NEXT: mulh.wu $s1, $t0, $a5 +; LA32-NEXT: mulh.wu $s1, $a4, $a3 ; LA32-NEXT: add.w $fp, $s1, $fp ; LA32-NEXT: add.w $fp, $fp, $s0 -; LA32-NEXT: mulh.wu $a6, $a6, $a0 -; LA32-NEXT: add.w $a6, $a6, $s4 -; LA32-NEXT: mul.w $a0, $a5, $a0 -; LA32-NEXT: add.w $a0, $a6, $a0 -; LA32-NEXT: mul.w $a5, $t3, $t0 -; LA32-NEXT: mulh.wu $a6, $t3, $a7 -; LA32-NEXT: add.w $a5, $a6, $a5 -; LA32-NEXT: add.w $a5, $a5, $s5 -; LA32-NEXT: add.w $a0, $a5, $a0 -; LA32-NEXT: sltu $a5, $s6, $s5 -; LA32-NEXT: add.w $a0, $a0, $a5 +; LA32-NEXT: mulh.wu $a7, $a7, $t3 +; LA32-NEXT: add.w $a7, $a7, $s4 +; LA32-NEXT: mul.w $a3, $a3, $t3 +; LA32-NEXT: add.w $a3, $a7, $a3 +; LA32-NEXT: mul.w $a4, $t4, $a4 +; LA32-NEXT: mulh.wu $a0, $t4, $a0 +; LA32-NEXT: add.w $a0, $a0, $a4 +; LA32-NEXT: add.w $a0, $a0, $s5 +; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: sltu $a3, $s6, $s5 +; LA32-NEXT: add.w $a0, $a0, $a3 ; LA32-NEXT: add.w $a0, $fp, $a0 ; LA32-NEXT: add.w $a0, $a0, $t8 -; LA32-NEXT: add.w $a0, $a0, $t4 -; LA32-NEXT: sltu $a5, $s8, $s7 -; LA32-NEXT: add.w $a0, $a0, $a5 +; LA32-NEXT: add.w $a0, $a0, $t5 +; LA32-NEXT: sltu $a3, $s8, $s7 +; LA32-NEXT: add.w $a0, $a0, $a3 ; LA32-NEXT: add.w $a0, $a0, $ra -; LA32-NEXT: srai.w $a5, $t2, 31 -; LA32-NEXT: xor $a0, $a0, $a5 -; LA32-NEXT: xor $a6, $t6, $a5 -; LA32-NEXT: or $a0, $a6, $a0 -; LA32-NEXT: xor $a6, $t7, $a5 -; LA32-NEXT: xor $a5, $t5, $a5 -; LA32-NEXT: or $a5, $a5, $a6 -; LA32-NEXT: or $a0, $a5, $a0 -; LA32-NEXT: sltu $a0, $zero, $a0 -; LA32-NEXT: mul.w $a3, $a4, $a3 -; LA32-NEXT: st.w $a3, $a2, 0 -; LA32-NEXT: st.w $a1, $a2, 4 -; LA32-NEXT: st.w $t1, $a2, 8 -; LA32-NEXT: st.w $t2, $a2, 12 +; LA32-NEXT: srai.w $a3, $a1, 31 +; LA32-NEXT: xor $a0, $a0, $a3 +; LA32-NEXT: xor $a4, $t6, $a3 +; LA32-NEXT: or $a0, $a4, $a0 +; LA32-NEXT: xor $a4, $t7, $a3 +; LA32-NEXT: xor $a3, $t0, $a3 +; LA32-NEXT: or $a3, $a3, $a4 +; LA32-NEXT: or $a0, $a3, $a0 +; LA32-NEXT: sltu $t3, $zero, $a0 +; LA32-NEXT: b .LBB1_17 +; LA32-NEXT: .LBB1_11: # %overflow.no.lhs +; LA32-NEXT: xor $t2, $a7, $t1 +; LA32-NEXT: xor $t1, $a3, $t1 +; LA32-NEXT: or $t1, $t2, $t1 +; LA32-NEXT: beq $t1, $zero, .LBB1_16 +; LA32-NEXT: # %bb.12: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a4, .LBB1_18 +; LA32-NEXT: # %bb.13: # %overflow.no.lhs.only +; LA32-NEXT: move $t1, $a0 +; LA32-NEXT: move $t3, $a4 +; LA32-NEXT: move $t2, $a6 +; LA32-NEXT: move $t4, $t0 +; LA32-NEXT: bgez $a4, .LBB1_19 +; LA32-NEXT: b .LBB1_20 +; LA32-NEXT: .LBB1_14: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a3, .LBB1_35 +; LA32-NEXT: # %bb.15: # %overflow.no.rhs.only +; LA32-NEXT: move $t1, $a7 +; LA32-NEXT: move $t3, $a3 +; LA32-NEXT: move $t2, $a5 +; LA32-NEXT: move $t4, $a1 +; LA32-NEXT: bgez $a3, .LBB1_36 +; LA32-NEXT: b .LBB1_37 +; LA32-NEXT: .LBB1_16: # %overflow.no +; LA32-NEXT: move $t3, $zero +; LA32-NEXT: mulh.wu $t1, $a6, $a5 +; LA32-NEXT: mul.w $t2, $t0, $a5 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: sltu $t2, $t1, $t2 +; LA32-NEXT: mulh.wu $t4, $t0, $a5 +; LA32-NEXT: add.w $t4, $t4, $t2 +; LA32-NEXT: mul.w 
$t2, $a6, $a1 +; LA32-NEXT: add.w $t1, $t2, $t1 +; LA32-NEXT: sltu $t2, $t1, $t2 +; LA32-NEXT: mulh.wu $t5, $a6, $a1 +; LA32-NEXT: add.w $t2, $t5, $t2 +; LA32-NEXT: add.w $t5, $t4, $t2 +; LA32-NEXT: mul.w $t6, $t0, $a1 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: mul.w $t2, $a5, $a0 +; LA32-NEXT: mul.w $t8, $a7, $a6 +; LA32-NEXT: add.w $fp, $t8, $t2 +; LA32-NEXT: add.w $t2, $t7, $fp +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $t7, $t2, $t7 +; LA32-NEXT: sltu $t4, $t5, $t4 +; LA32-NEXT: mulh.wu $t5, $t0, $a1 +; LA32-NEXT: add.w $t4, $t5, $t4 +; LA32-NEXT: add.w $t4, $t4, $t6 +; LA32-NEXT: mul.w $t0, $a7, $t0 +; LA32-NEXT: mulh.wu $a7, $a7, $a6 +; LA32-NEXT: add.w $a7, $a7, $t0 +; LA32-NEXT: mul.w $a3, $a3, $a6 +; LA32-NEXT: add.w $a3, $a7, $a3 +; LA32-NEXT: mulh.wu $a7, $a5, $a0 +; LA32-NEXT: mul.w $a4, $a5, $a4 +; LA32-NEXT: add.w $a4, $a7, $a4 +; LA32-NEXT: mul.w $a0, $a1, $a0 +; LA32-NEXT: add.w $a0, $a4, $a0 +; LA32-NEXT: add.w $a0, $a3, $a0 +; LA32-NEXT: sltu $a1, $fp, $t8 +; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: add.w $a0, $t4, $a0 +; LA32-NEXT: add.w $a1, $a0, $t7 +; LA32-NEXT: .LBB1_17: # %overflow.res +; LA32-NEXT: mul.w $a0, $a6, $a5 +; LA32-NEXT: b .LBB1_53 +; LA32-NEXT: .LBB1_18: +; LA32-NEXT: sub.w $t2, $zero, $a0 +; LA32-NEXT: or $t1, $a6, $t0 +; LA32-NEXT: sltu $t3, $zero, $t1 +; LA32-NEXT: sub.w $t1, $t2, $t3 +; LA32-NEXT: sltu $t2, $t2, $t3 +; LA32-NEXT: sltu $t3, $zero, $a0 +; LA32-NEXT: add.w $t3, $a4, $t3 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: sub.w $t3, $zero, $t2 +; LA32-NEXT: sub.w $t2, $zero, $a6 +; LA32-NEXT: sltu $t4, $zero, $a6 +; LA32-NEXT: add.w $t4, $t0, $t4 +; LA32-NEXT: sub.w $t4, $zero, $t4 +; LA32-NEXT: bltz $a4, .LBB1_20 +; LA32-NEXT: .LBB1_19: # %overflow.no.lhs.only +; LA32-NEXT: move $t3, $a4 +; LA32-NEXT: move $t1, $a0 +; LA32-NEXT: .LBB1_20: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a4, .LBB1_24 +; LA32-NEXT: # %bb.21: # %overflow.no.lhs.only +; LA32-NEXT: move $t4, $t0 +; LA32-NEXT: bgez $a4, .LBB1_25 +; LA32-NEXT: .LBB1_22: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a3, .LBB1_26 +; LA32-NEXT: .LBB1_23: # %overflow.no.lhs.only +; LA32-NEXT: move $a0, $a7 +; LA32-NEXT: move $a6, $a3 +; LA32-NEXT: move $t0, $a5 +; LA32-NEXT: move $t5, $a1 +; LA32-NEXT: bgez $a3, .LBB1_27 +; LA32-NEXT: b .LBB1_28 +; LA32-NEXT: .LBB1_24: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a4, .LBB1_22 +; LA32-NEXT: .LBB1_25: # %overflow.no.lhs.only +; LA32-NEXT: move $t2, $a6 +; LA32-NEXT: bgez $a3, .LBB1_23 +; LA32-NEXT: .LBB1_26: +; LA32-NEXT: sub.w $a6, $zero, $a7 +; LA32-NEXT: or $a0, $a5, $a1 +; LA32-NEXT: sltu $t0, $zero, $a0 +; LA32-NEXT: sub.w $a0, $a6, $t0 +; LA32-NEXT: sltu $a6, $a6, $t0 +; LA32-NEXT: sltu $t0, $zero, $a7 +; LA32-NEXT: add.w $t0, $a3, $t0 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: sub.w $a6, $zero, $a6 +; LA32-NEXT: sub.w $t0, $zero, $a5 +; LA32-NEXT: sltu $t5, $zero, $a5 +; LA32-NEXT: add.w $t5, $a1, $t5 +; LA32-NEXT: sub.w $t5, $zero, $t5 +; LA32-NEXT: bltz $a3, .LBB1_28 +; LA32-NEXT: .LBB1_27: # %overflow.no.lhs.only +; LA32-NEXT: move $a6, $a3 +; LA32-NEXT: move $a0, $a7 +; LA32-NEXT: .LBB1_28: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a3, .LBB1_30 +; LA32-NEXT: # %bb.29: # %overflow.no.lhs.only +; LA32-NEXT: move $t5, $a1 +; LA32-NEXT: bgez $a3, .LBB1_31 +; LA32-NEXT: b .LBB1_32 +; LA32-NEXT: .LBB1_30: # %overflow.no.lhs.only +; LA32-NEXT: bltz $a3, .LBB1_32 +; LA32-NEXT: .LBB1_31: # %overflow.no.lhs.only +; LA32-NEXT: move $t0, $a5 +; LA32-NEXT: .LBB1_32: # %overflow.no.lhs.only +; LA32-NEXT: slti $a1, $a4, 0 
+; LA32-NEXT: slti $a3, $a3, 0 +; LA32-NEXT: mulh.wu $a4, $t2, $t0 +; LA32-NEXT: mul.w $a5, $t4, $t0 +; LA32-NEXT: add.w $a4, $a5, $a4 +; LA32-NEXT: sltu $a5, $a4, $a5 +; LA32-NEXT: mulh.wu $a7, $t4, $t0 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: mul.w $a7, $t2, $t5 +; LA32-NEXT: add.w $a4, $a7, $a4 +; LA32-NEXT: sltu $a7, $a4, $a7 +; LA32-NEXT: mulh.wu $t6, $t2, $t5 +; LA32-NEXT: add.w $a7, $t6, $a7 +; LA32-NEXT: add.w $a7, $a5, $a7 +; LA32-NEXT: mul.w $t6, $t4, $t5 +; LA32-NEXT: add.w $t7, $t6, $a7 +; LA32-NEXT: mul.w $t8, $t0, $t1 +; LA32-NEXT: add.w $t8, $t7, $t8 +; LA32-NEXT: sltu $fp, $t8, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $a5, $a7, $a5 +; LA32-NEXT: mulh.wu $a7, $t4, $t5 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: add.w $a5, $a5, $t6 +; LA32-NEXT: mulh.wu $a7, $t0, $t1 +; LA32-NEXT: mul.w $t6, $t0, $t3 +; LA32-NEXT: add.w $a7, $a7, $t6 +; LA32-NEXT: mul.w $t5, $t5, $t1 +; LA32-NEXT: add.w $a7, $a7, $t5 +; LA32-NEXT: add.w $a5, $a5, $a7 +; LA32-NEXT: add.w $a7, $a5, $fp +; LA32-NEXT: mul.w $a5, $t2, $t0 +; LA32-NEXT: mulh.wu $t0, $t2, $a0 +; LA32-NEXT: mul.w $t5, $t4, $a0 +; LA32-NEXT: add.w $t0, $t5, $t0 +; LA32-NEXT: sltu $t5, $t0, $t5 +; LA32-NEXT: mulh.wu $t6, $t4, $a0 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: mul.w $t6, $t2, $a6 +; LA32-NEXT: add.w $t7, $t6, $t0 +; LA32-NEXT: sltu $t0, $t7, $t6 +; LA32-NEXT: mulh.wu $t6, $t2, $a6 +; LA32-NEXT: add.w $t0, $t6, $t0 +; LA32-NEXT: add.w $t6, $t5, $t0 +; LA32-NEXT: mul.w $fp, $t4, $a6 +; LA32-NEXT: add.w $s0, $fp, $t6 +; LA32-NEXT: mul.w $t0, $a0, $t1 +; LA32-NEXT: add.w $t0, $s0, $t0 +; LA32-NEXT: sltu $s1, $t0, $s0 +; LA32-NEXT: sltu $fp, $s0, $fp +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: mulh.wu $t4, $t4, $a6 +; LA32-NEXT: add.w $t4, $t4, $t5 +; LA32-NEXT: add.w $t4, $t4, $fp +; LA32-NEXT: mulh.wu $t5, $a0, $t1 +; LA32-NEXT: mul.w $t3, $a0, $t3 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: mul.w $a6, $a6, $t1 +; LA32-NEXT: add.w $a6, $t3, $a6 +; LA32-NEXT: add.w $t3, $t4, $a6 +; LA32-NEXT: mul.w $a0, $t2, $a0 +; LA32-NEXT: add.w $t2, $a7, $t7 +; LA32-NEXT: add.w $a6, $t8, $a0 +; LA32-NEXT: sltu $t1, $a6, $t8 +; LA32-NEXT: add.w $t2, $t2, $t1 +; LA32-NEXT: add.w $a0, $t3, $s1 +; LA32-NEXT: beq $t2, $a7, .LBB1_34 +; LA32-NEXT: # %bb.33: # %overflow.no.lhs.only +; LA32-NEXT: sltu $t1, $t2, $a7 +; LA32-NEXT: .LBB1_34: # %overflow.no.lhs.only +; LA32-NEXT: add.w $a7, $t0, $t1 +; LA32-NEXT: sltu $t0, $a7, $t0 +; LA32-NEXT: add.w $t0, $a0, $t0 +; LA32-NEXT: xor $a1, $a3, $a1 +; LA32-NEXT: sub.w $a3, $zero, $a1 +; LA32-NEXT: xor $a4, $a4, $a3 +; LA32-NEXT: xor $a5, $a5, $a3 +; LA32-NEXT: add.w $a0, $a5, $a1 +; LA32-NEXT: sltu $a5, $a0, $a5 +; LA32-NEXT: add.w $t1, $a4, $a5 +; LA32-NEXT: sltui $a4, $t1, 1 +; LA32-NEXT: sltu $a1, $a0, $a1 +; LA32-NEXT: and $a4, $a4, $a1 +; LA32-NEXT: xor $a1, $t2, $a3 +; LA32-NEXT: xor $a5, $a6, $a3 +; LA32-NEXT: add.w $t2, $a5, $a4 +; LA32-NEXT: sltu $a5, $t2, $a5 +; LA32-NEXT: add.w $a1, $a1, $a5 +; LA32-NEXT: sltui $a5, $a1, 1 +; LA32-NEXT: sltu $a4, $t2, $a4 +; LA32-NEXT: and $a4, $a5, $a4 +; LA32-NEXT: xor $a5, $t0, $a3 +; LA32-NEXT: xor $a3, $a7, $a3 +; LA32-NEXT: add.w $a4, $a3, $a4 +; LA32-NEXT: sltu $a3, $a4, $a3 +; LA32-NEXT: add.w $a3, $a5, $a3 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: b .LBB1_52 +; LA32-NEXT: .LBB1_35: +; LA32-NEXT: sub.w $t2, $zero, $a7 +; LA32-NEXT: or $t1, $a5, $a1 +; LA32-NEXT: sltu $t3, $zero, $t1 +; LA32-NEXT: sub.w $t1, $t2, $t3 +; LA32-NEXT: sltu $t2, $t2, $t3 +; LA32-NEXT: sltu $t3, $zero, $a7 +; LA32-NEXT: add.w $t3, $a3, 
$t3 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: sub.w $t3, $zero, $t2 +; LA32-NEXT: sub.w $t2, $zero, $a5 +; LA32-NEXT: sltu $t4, $zero, $a5 +; LA32-NEXT: add.w $t4, $a1, $t4 +; LA32-NEXT: sub.w $t4, $zero, $t4 +; LA32-NEXT: bltz $a3, .LBB1_37 +; LA32-NEXT: .LBB1_36: # %overflow.no.rhs.only +; LA32-NEXT: move $t3, $a3 +; LA32-NEXT: move $t1, $a7 +; LA32-NEXT: .LBB1_37: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a3, .LBB1_41 +; LA32-NEXT: # %bb.38: # %overflow.no.rhs.only +; LA32-NEXT: move $t4, $a1 +; LA32-NEXT: bgez $a3, .LBB1_42 +; LA32-NEXT: .LBB1_39: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a4, .LBB1_43 +; LA32-NEXT: .LBB1_40: # %overflow.no.rhs.only +; LA32-NEXT: move $a1, $a0 +; LA32-NEXT: move $a5, $a4 +; LA32-NEXT: move $a7, $a6 +; LA32-NEXT: move $t5, $t0 +; LA32-NEXT: bgez $a4, .LBB1_44 +; LA32-NEXT: b .LBB1_45 +; LA32-NEXT: .LBB1_41: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a3, .LBB1_39 +; LA32-NEXT: .LBB1_42: # %overflow.no.rhs.only +; LA32-NEXT: move $t2, $a5 +; LA32-NEXT: bgez $a4, .LBB1_40 +; LA32-NEXT: .LBB1_43: +; LA32-NEXT: sub.w $a5, $zero, $a0 +; LA32-NEXT: or $a1, $a6, $t0 +; LA32-NEXT: sltu $a7, $zero, $a1 +; LA32-NEXT: sub.w $a1, $a5, $a7 +; LA32-NEXT: sltu $a5, $a5, $a7 +; LA32-NEXT: sltu $a7, $zero, $a0 +; LA32-NEXT: add.w $a7, $a4, $a7 +; LA32-NEXT: add.w $a5, $a7, $a5 +; LA32-NEXT: sub.w $a5, $zero, $a5 +; LA32-NEXT: sub.w $a7, $zero, $a6 +; LA32-NEXT: sltu $t5, $zero, $a6 +; LA32-NEXT: add.w $t5, $t0, $t5 +; LA32-NEXT: sub.w $t5, $zero, $t5 +; LA32-NEXT: bltz $a4, .LBB1_45 +; LA32-NEXT: .LBB1_44: # %overflow.no.rhs.only +; LA32-NEXT: move $a5, $a4 +; LA32-NEXT: move $a1, $a0 +; LA32-NEXT: .LBB1_45: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a4, .LBB1_47 +; LA32-NEXT: # %bb.46: # %overflow.no.rhs.only +; LA32-NEXT: move $t5, $t0 +; LA32-NEXT: bgez $a4, .LBB1_48 +; LA32-NEXT: b .LBB1_49 +; LA32-NEXT: .LBB1_47: # %overflow.no.rhs.only +; LA32-NEXT: bltz $a4, .LBB1_49 +; LA32-NEXT: .LBB1_48: # %overflow.no.rhs.only +; LA32-NEXT: move $a7, $a6 +; LA32-NEXT: .LBB1_49: # %overflow.no.rhs.only +; LA32-NEXT: slti $a0, $a3, 0 +; LA32-NEXT: slti $a3, $a4, 0 +; LA32-NEXT: mulh.wu $a4, $t2, $a7 +; LA32-NEXT: mul.w $a6, $t4, $a7 +; LA32-NEXT: add.w $a4, $a6, $a4 +; LA32-NEXT: sltu $a6, $a4, $a6 +; LA32-NEXT: mulh.wu $t0, $t4, $a7 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: mul.w $t0, $t2, $t5 +; LA32-NEXT: add.w $a4, $t0, $a4 +; LA32-NEXT: sltu $t0, $a4, $t0 +; LA32-NEXT: mulh.wu $t6, $t2, $t5 +; LA32-NEXT: add.w $t0, $t6, $t0 +; LA32-NEXT: add.w $t0, $a6, $t0 +; LA32-NEXT: mul.w $t6, $t4, $t5 +; LA32-NEXT: add.w $t7, $t6, $t0 +; LA32-NEXT: mul.w $t8, $a7, $t1 +; LA32-NEXT: add.w $t8, $t7, $t8 +; LA32-NEXT: sltu $fp, $t8, $t7 +; LA32-NEXT: sltu $t6, $t7, $t6 +; LA32-NEXT: sltu $a6, $t0, $a6 +; LA32-NEXT: mulh.wu $t0, $t4, $t5 +; LA32-NEXT: add.w $a6, $t0, $a6 +; LA32-NEXT: add.w $a6, $a6, $t6 +; LA32-NEXT: mulh.wu $t0, $a7, $t1 +; LA32-NEXT: mul.w $t6, $a7, $t3 +; LA32-NEXT: add.w $t0, $t0, $t6 +; LA32-NEXT: mul.w $t5, $t5, $t1 +; LA32-NEXT: add.w $t0, $t0, $t5 +; LA32-NEXT: add.w $a6, $a6, $t0 +; LA32-NEXT: add.w $t0, $a6, $fp +; LA32-NEXT: mul.w $a6, $t2, $a7 +; LA32-NEXT: mulh.wu $a7, $t2, $a1 +; LA32-NEXT: mul.w $t5, $t4, $a1 +; LA32-NEXT: add.w $a7, $t5, $a7 +; LA32-NEXT: sltu $t5, $a7, $t5 +; LA32-NEXT: mulh.wu $t6, $t4, $a1 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: mul.w $t6, $t2, $a5 +; LA32-NEXT: add.w $t7, $t6, $a7 +; LA32-NEXT: sltu $a7, $t7, $t6 +; LA32-NEXT: mulh.wu $t6, $t2, $a5 +; LA32-NEXT: add.w $a7, $t6, $a7 +; LA32-NEXT: add.w $t6, 
$t5, $a7 +; LA32-NEXT: mul.w $fp, $t4, $a5 +; LA32-NEXT: add.w $s0, $fp, $t6 +; LA32-NEXT: mul.w $a7, $a1, $t1 +; LA32-NEXT: add.w $a7, $s0, $a7 +; LA32-NEXT: sltu $s1, $a7, $s0 +; LA32-NEXT: sltu $fp, $s0, $fp +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: mulh.wu $t4, $t4, $a5 +; LA32-NEXT: add.w $t4, $t4, $t5 +; LA32-NEXT: add.w $t4, $t4, $fp +; LA32-NEXT: mulh.wu $t5, $a1, $t1 +; LA32-NEXT: mul.w $t3, $a1, $t3 +; LA32-NEXT: add.w $t3, $t5, $t3 +; LA32-NEXT: mul.w $a5, $a5, $t1 +; LA32-NEXT: add.w $a5, $t3, $a5 +; LA32-NEXT: add.w $t1, $t4, $a5 +; LA32-NEXT: mul.w $a1, $t2, $a1 +; LA32-NEXT: add.w $a5, $t0, $t7 +; LA32-NEXT: add.w $a1, $t8, $a1 +; LA32-NEXT: sltu $t2, $a1, $t8 +; LA32-NEXT: add.w $a5, $a5, $t2 +; LA32-NEXT: add.w $t1, $t1, $s1 +; LA32-NEXT: beq $a5, $t0, .LBB1_51 +; LA32-NEXT: # %bb.50: # %overflow.no.rhs.only +; LA32-NEXT: sltu $t2, $a5, $t0 +; LA32-NEXT: .LBB1_51: # %overflow.no.rhs.only +; LA32-NEXT: add.w $t0, $a7, $t2 +; LA32-NEXT: sltu $a7, $t0, $a7 +; LA32-NEXT: add.w $a7, $t1, $a7 +; LA32-NEXT: xor $a3, $a0, $a3 +; LA32-NEXT: sub.w $t3, $zero, $a3 +; LA32-NEXT: xor $a4, $a4, $t3 +; LA32-NEXT: xor $a6, $a6, $t3 +; LA32-NEXT: add.w $a0, $a6, $a3 +; LA32-NEXT: sltu $a6, $a0, $a6 +; LA32-NEXT: add.w $t1, $a4, $a6 +; LA32-NEXT: sltui $a4, $t1, 1 +; LA32-NEXT: sltu $a3, $a0, $a3 +; LA32-NEXT: and $a3, $a4, $a3 +; LA32-NEXT: xor $a4, $a5, $t3 +; LA32-NEXT: xor $a1, $a1, $t3 +; LA32-NEXT: add.w $t2, $a1, $a3 +; LA32-NEXT: sltu $a1, $t2, $a1 +; LA32-NEXT: add.w $a1, $a4, $a1 +; LA32-NEXT: sltui $a4, $a1, 1 +; LA32-NEXT: sltu $a3, $t2, $a3 +; LA32-NEXT: and $a3, $a4, $a3 +; LA32-NEXT: xor $a4, $a7, $t3 +; LA32-NEXT: xor $a5, $t0, $t3 +; LA32-NEXT: add.w $a3, $a5, $a3 +; LA32-NEXT: sltu $a5, $a3, $a5 +; LA32-NEXT: add.w $a4, $a4, $a5 +; LA32-NEXT: or $a3, $a3, $a4 +; LA32-NEXT: .LBB1_52: # %overflow.res +; LA32-NEXT: sltu $t3, $zero, $a3 +; LA32-NEXT: .LBB1_53: # %overflow.res +; LA32-NEXT: st.w $a0, $a2, 0 +; LA32-NEXT: st.w $t1, $a2, 4 +; LA32-NEXT: st.w $t2, $a2, 8 +; LA32-NEXT: andi $a0, $t3, 1 +; LA32-NEXT: st.w $a1, $a2, 12 ; LA32-NEXT: ld.w $s8, $sp, 4 # 4-byte Folded Reload ; LA32-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload @@ -295,7 +838,13 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-NEXT: ret ; ; LA64-LABEL: smuloi128: -; LA64: # %bb.0: +; LA64: # %bb.0: # %overflow.entry +; LA64-NEXT: srai.d $a6, $a0, 63 +; LA64-NEXT: srai.d $a5, $a2, 63 +; LA64-NEXT: beq $a1, $a6, .LBB1_3 +; LA64-NEXT: # %bb.1: # %overflow.lhs +; LA64-NEXT: beq $a3, $a5, .LBB1_5 +; LA64-NEXT: # %bb.2: # %overflow ; LA64-NEXT: mulh.du $a5, $a0, $a2 ; LA64-NEXT: mul.d $a6, $a1, $a2 ; LA64-NEXT: add.d $a5, $a6, $a5 @@ -329,11 +878,129 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA64-NEXT: xor $a1, $a1, $a6 ; LA64-NEXT: xor $a3, $a3, $a6 ; LA64-NEXT: or $a1, $a3, $a1 -; LA64-NEXT: sltu $a1, $zero, $a1 +; LA64-NEXT: sltu $a6, $zero, $a1 +; LA64-NEXT: b .LBB1_8 +; LA64-NEXT: .LBB1_3: # %overflow.no.lhs +; LA64-NEXT: beq $a3, $a5, .LBB1_7 +; LA64-NEXT: # %bb.4: # %overflow.no.lhs.only +; LA64-NEXT: slti $a5, $a1, 0 +; LA64-NEXT: masknez $a6, $a0, $a5 +; LA64-NEXT: sub.d $a7, $zero, $a0 +; LA64-NEXT: maskeqz $a7, $a7, $a5 +; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: masknez $t0, $a1, $a5 +; LA64-NEXT: sltu $a0, $zero, $a0 +; LA64-NEXT: add.d $a0, $a1, $a0 +; LA64-NEXT: sub.d $a0, $zero, $a0 +; LA64-NEXT: maskeqz $a0, $a0, $a5 +; LA64-NEXT: or $a0, $a0, $t0 +; LA64-NEXT: maskeqz $a0, $a0, $a5 +; 
LA64-NEXT: or $a0, $a0, $t0 +; LA64-NEXT: maskeqz $a1, $a7, $a5 +; LA64-NEXT: or $a1, $a1, $a6 +; LA64-NEXT: slti $a6, $a3, 0 +; LA64-NEXT: masknez $a7, $a2, $a6 +; LA64-NEXT: sub.d $t0, $zero, $a2 +; LA64-NEXT: maskeqz $t0, $t0, $a6 +; LA64-NEXT: or $t0, $t0, $a7 +; LA64-NEXT: masknez $t1, $a3, $a6 +; LA64-NEXT: sltu $a2, $zero, $a2 +; LA64-NEXT: add.d $a2, $a3, $a2 +; LA64-NEXT: sub.d $a2, $zero, $a2 +; LA64-NEXT: maskeqz $a2, $a2, $a6 +; LA64-NEXT: or $a2, $a2, $t1 +; LA64-NEXT: maskeqz $a2, $a2, $a6 +; LA64-NEXT: or $a2, $a2, $t1 +; LA64-NEXT: maskeqz $a3, $t0, $a6 +; LA64-NEXT: or $a3, $a3, $a7 +; LA64-NEXT: mulh.du $a7, $a1, $a3 +; LA64-NEXT: mul.d $t0, $a0, $a3 +; LA64-NEXT: add.d $a7, $a7, $t0 +; LA64-NEXT: mul.d $a3, $a1, $a3 +; LA64-NEXT: mul.d $a0, $a0, $a2 +; LA64-NEXT: mulh.du $t0, $a1, $a2 +; LA64-NEXT: add.d $a0, $t0, $a0 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: add.d $a1, $a7, $a1 +; LA64-NEXT: sltu $a2, $a1, $a7 +; LA64-NEXT: add.d $a2, $a0, $a2 +; LA64-NEXT: xor $a5, $a6, $a5 +; LA64-NEXT: sub.d $a6, $zero, $a5 +; LA64-NEXT: xor $a0, $a3, $a6 +; LA64-NEXT: add.d $a0, $a0, $a5 +; LA64-NEXT: sltu $a3, $a0, $a5 +; LA64-NEXT: xor $a1, $a1, $a6 +; LA64-NEXT: add.d $a5, $a1, $a3 +; LA64-NEXT: sltu $a1, $a5, $a3 +; LA64-NEXT: b .LBB1_6 +; LA64-NEXT: .LBB1_5: # %overflow.no.rhs.only +; LA64-NEXT: slti $a5, $a3, 0 +; LA64-NEXT: masknez $a6, $a2, $a5 +; LA64-NEXT: sub.d $a7, $zero, $a2 +; LA64-NEXT: maskeqz $a7, $a7, $a5 +; LA64-NEXT: or $a7, $a7, $a6 +; LA64-NEXT: masknez $t0, $a3, $a5 +; LA64-NEXT: sltu $a2, $zero, $a2 +; LA64-NEXT: add.d $a2, $a3, $a2 +; LA64-NEXT: sub.d $a2, $zero, $a2 +; LA64-NEXT: maskeqz $a2, $a2, $a5 +; LA64-NEXT: or $a2, $a2, $t0 +; LA64-NEXT: maskeqz $a2, $a2, $a5 +; LA64-NEXT: or $a2, $a2, $t0 +; LA64-NEXT: maskeqz $a3, $a7, $a5 +; LA64-NEXT: or $a3, $a3, $a6 +; LA64-NEXT: slti $a6, $a1, 0 +; LA64-NEXT: masknez $a7, $a0, $a6 +; LA64-NEXT: sub.d $t0, $zero, $a0 +; LA64-NEXT: maskeqz $t0, $t0, $a6 +; LA64-NEXT: or $t0, $t0, $a7 +; LA64-NEXT: masknez $t1, $a1, $a6 +; LA64-NEXT: sltu $a0, $zero, $a0 +; LA64-NEXT: add.d $a0, $a1, $a0 +; LA64-NEXT: sub.d $a0, $zero, $a0 +; LA64-NEXT: maskeqz $a0, $a0, $a6 +; LA64-NEXT: or $a0, $a0, $t1 +; LA64-NEXT: maskeqz $a0, $a0, $a6 +; LA64-NEXT: or $a0, $a0, $t1 +; LA64-NEXT: maskeqz $a1, $t0, $a6 +; LA64-NEXT: or $a1, $a1, $a7 +; LA64-NEXT: mulh.du $a7, $a3, $a1 +; LA64-NEXT: mul.d $t0, $a2, $a1 +; LA64-NEXT: add.d $a7, $a7, $t0 +; LA64-NEXT: mul.d $a1, $a3, $a1 +; LA64-NEXT: mul.d $a2, $a2, $a0 +; LA64-NEXT: mulh.du $t0, $a3, $a0 +; LA64-NEXT: add.d $a2, $t0, $a2 +; LA64-NEXT: mul.d $a0, $a3, $a0 +; LA64-NEXT: add.d $a3, $a7, $a0 +; LA64-NEXT: sltu $a0, $a3, $a7 +; LA64-NEXT: add.d $a2, $a2, $a0 +; LA64-NEXT: xor $a5, $a5, $a6 +; LA64-NEXT: sub.d $a6, $zero, $a5 +; LA64-NEXT: xor $a0, $a1, $a6 +; LA64-NEXT: add.d $a0, $a0, $a5 +; LA64-NEXT: sltu $a1, $a0, $a5 +; LA64-NEXT: xor $a3, $a3, $a6 +; LA64-NEXT: add.d $a5, $a3, $a1 +; LA64-NEXT: sltu $a1, $a5, $a1 +; LA64-NEXT: .LBB1_6: # %overflow.res +; LA64-NEXT: xor $a2, $a2, $a6 +; LA64-NEXT: add.d $a1, $a2, $a1 +; LA64-NEXT: sltu $a6, $zero, $a1 +; LA64-NEXT: b .LBB1_9 +; LA64-NEXT: .LBB1_7: # %overflow.no +; LA64-NEXT: move $a6, $zero +; LA64-NEXT: mulh.du $a5, $a0, $a2 +; LA64-NEXT: mul.d $a3, $a0, $a3 +; LA64-NEXT: add.d $a3, $a5, $a3 +; LA64-NEXT: mul.d $a1, $a1, $a2 +; LA64-NEXT: add.d $a5, $a3, $a1 +; LA64-NEXT: .LBB1_8: # %overflow.res ; LA64-NEXT: mul.d $a0, $a0, $a2 +; LA64-NEXT: .LBB1_9: # %overflow.res ; LA64-NEXT: st.d $a0, $a4, 0 +; LA64-NEXT: andi 
$a0, $a6, 1 ; LA64-NEXT: st.d $a5, $a4, 8 -; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) %val = extractvalue {i128, i1} %t, 0 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index f573fdab1b153..5bebf54c3c1a0 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -4,136 +4,343 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: -; PPC64: # %bb.0: # %start -; PPC64-NEXT: addic 9, 5, -1 -; PPC64-NEXT: mulld 10, 5, 4 -; PPC64-NEXT: mulld 11, 3, 6 -; PPC64-NEXT: subfe 9, 9, 5 -; PPC64-NEXT: add 10, 11, 10 -; PPC64-NEXT: addic 11, 3, -1 -; PPC64-NEXT: mulhdu 8, 3, 6 -; PPC64-NEXT: subfe 3, 11, 3 -; PPC64-NEXT: and 3, 3, 9 -; PPC64-NEXT: addic 9, 8, -1 -; PPC64-NEXT: subfe 8, 9, 8 -; PPC64-NEXT: or 3, 3, 8 -; PPC64-NEXT: mulhdu 5, 5, 4 -; PPC64-NEXT: addic 8, 5, -1 -; PPC64-NEXT: subfe 5, 8, 5 -; PPC64-NEXT: li 7, 0 -; PPC64-NEXT: or 5, 3, 5 -; PPC64-NEXT: mulhdu 8, 4, 6 -; PPC64-NEXT: addc 3, 8, 10 -; PPC64-NEXT: addze 7, 7 -; PPC64-NEXT: addic 8, 7, -1 -; PPC64-NEXT: subfe 7, 8, 7 -; PPC64-NEXT: or 5, 5, 7 +; PPC64: # %bb.0: # %overflow.entry +; PPC64-NEXT: cmpldi 3, 0 +; PPC64-NEXT: beq 0, .LBB0_3 +; PPC64-NEXT: # %bb.1: # %overflow.lhs +; PPC64-NEXT: cmpldi 5, 0 +; PPC64-NEXT: beq 0, .LBB0_5 +; PPC64-NEXT: # %bb.2: # %overflow +; PPC64-NEXT: mulhdu. 7, 3, 6 +; PPC64-NEXT: mcrf 5, 0 +; PPC64-NEXT: cmpdi 6, 5, 0 +; PPC64-NEXT: mulhdu. 7, 5, 4 +; PPC64-NEXT: mcrf 1, 0 +; PPC64-NEXT: cmpdi 3, 0 +; PPC64-NEXT: mulld 5, 5, 4 +; PPC64-NEXT: mulld 3, 3, 6 +; PPC64-NEXT: crnor 20, 26, 2 +; PPC64-NEXT: add 3, 3, 5 +; PPC64-NEXT: crorc 20, 20, 22 +; PPC64-NEXT: mulhdu 7, 4, 6 +; PPC64-NEXT: addc 3, 7, 3 +; PPC64-NEXT: li 5, 0 +; PPC64-NEXT: addze. 5, 5 +; PPC64-NEXT: crorc 20, 20, 6 +; PPC64-NEXT: crorc 20, 20, 2 ; PPC64-NEXT: mulld 4, 4, 6 +; PPC64-NEXT: b .LBB0_7 +; PPC64-NEXT: .LBB0_3: # %overflow.no.lhs +; PPC64-NEXT: cmpldi 5, 0 +; PPC64-NEXT: beq 0, .LBB0_6 +; PPC64-NEXT: # %bb.4: # %overflow.no.lhs.only +; PPC64-NEXT: mulhdu 7, 4, 6 +; PPC64-NEXT: mulld 8, 3, 6 +; PPC64-NEXT: mulld 9, 3, 5 +; PPC64-NEXT: add 3, 7, 8 +; PPC64-NEXT: mulhdu 7, 4, 5 +; PPC64-NEXT: mulld 5, 4, 5 +; PPC64-NEXT: mulld 4, 4, 6 +; PPC64-NEXT: addc 3, 3, 5 +; PPC64-NEXT: adde. 5, 7, 9 +; PPC64-NEXT: crnot 20, 2 +; PPC64-NEXT: b .LBB0_7 +; PPC64-NEXT: .LBB0_5: # %overflow.no.rhs.only +; PPC64-NEXT: mulhdu 7, 6, 4 +; PPC64-NEXT: mulld 8, 5, 4 +; PPC64-NEXT: mulld 5, 5, 3 +; PPC64-NEXT: mulld 4, 6, 4 +; PPC64-NEXT: add 7, 7, 8 +; PPC64-NEXT: mulhdu 8, 6, 3 +; PPC64-NEXT: mulld 3, 6, 3 +; PPC64-NEXT: addc 3, 7, 3 +; PPC64-NEXT: adde. 
5, 8, 5 +; PPC64-NEXT: crnot 20, 2 +; PPC64-NEXT: b .LBB0_7 +; PPC64-NEXT: .LBB0_6: # %overflow.no +; PPC64-NEXT: mulld 5, 4, 5 +; PPC64-NEXT: mulhdu 7, 4, 6 +; PPC64-NEXT: mulld 3, 3, 6 +; PPC64-NEXT: add 5, 7, 5 +; PPC64-NEXT: mulld 4, 4, 6 +; PPC64-NEXT: add 3, 5, 3 +; PPC64-NEXT: crxor 20, 20, 20 +; PPC64-NEXT: .LBB0_7: # %overflow.res +; PPC64-NEXT: li 5, 1 +; PPC64-NEXT: bclr 12, 20, 0 +; PPC64-NEXT: # %bb.8: # %overflow.res +; PPC64-NEXT: li 5, 0 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: -; PPC32: # %bb.0: # %start -; PPC32-NEXT: stwu 1, -64(1) -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill +; PPC32: # %bb.0: # %overflow.entry +; PPC32-NEXT: stwu 1, -80(1) +; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill ; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 27, 9, 4 -; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill -; PPC32-NEXT: mr 11, 7 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: mullw 26, 3, 10 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 27, 26, 27 -; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 7, 11, 0 -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 24, 11, 6 -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 0, 8, 6 -; PPC32-NEXT: stw 12, 16(1) -; PPC32-NEXT: mr 12, 5 -; PPC32-NEXT: mulhwu 5, 4, 10 -; PPC32-NEXT: addc 5, 5, 27 -; PPC32-NEXT: addze 27, 7 -; PPC32-NEXT: cmpwi 2, 27, 0 -; PPC32-NEXT: mullw 25, 12, 8 -; PPC32-NEXT: add 26, 24, 25 -; PPC32-NEXT: addc 0, 0, 26 -; PPC32-NEXT: addze 26, 7 -; PPC32-NEXT: mullw 23, 8, 6 -; PPC32-NEXT: mullw 22, 4, 10 -; PPC32-NEXT: addc 24, 22, 23 -; PPC32-NEXT: adde 22, 5, 0 -; PPC32-NEXT: mulhwu 29, 6, 10 -; PPC32-NEXT: mullw 21, 12, 10 -; PPC32-NEXT: addc 5, 21, 29 -; PPC32-NEXT: mulhwu 30, 12, 10 -; PPC32-NEXT: addze 0, 30 -; PPC32-NEXT: mullw 23, 6, 9 -; PPC32-NEXT: addc 5, 23, 5 -; PPC32-NEXT: mulhwu 28, 6, 9 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: addc 0, 0, 29 -; PPC32-NEXT: addze 29, 7 -; PPC32-NEXT: mullw 30, 12, 9 -; PPC32-NEXT: addc 0, 30, 0 -; PPC32-NEXT: mulhwu 25, 12, 9 -; PPC32-NEXT: adde 30, 25, 29 -; PPC32-NEXT: addc 0, 0, 24 -; PPC32-NEXT: adde 30, 30, 22 -; PPC32-NEXT: addze. 29, 7 +; PPC32-NEXT: or. 30, 4, 3 +; PPC32-NEXT: stw 18, 24(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 12, 20(1) +; PPC32-NEXT: beq 0, .LBB0_3 +; PPC32-NEXT: # %bb.1: # %overflow.lhs +; PPC32-NEXT: or. 
29, 8, 7 +; PPC32-NEXT: beq 0, .LBB0_5 +; PPC32-NEXT: # %bb.2: # %overflow +; PPC32-NEXT: mullw 28, 9, 4 +; PPC32-NEXT: li 19, 0 +; PPC32-NEXT: cmpwi 2, 7, 0 +; PPC32-NEXT: cmpwi 3, 5, 0 +; PPC32-NEXT: cmpwi 7, 3, 0 +; PPC32-NEXT: mullw 27, 3, 10 +; PPC32-NEXT: add 28, 27, 28 +; PPC32-NEXT: mulhwu 11, 4, 10 +; PPC32-NEXT: addc 11, 11, 28 +; PPC32-NEXT: addze 28, 19 +; PPC32-NEXT: mullw 24, 5, 8 +; PPC32-NEXT: mullw 23, 7, 6 +; PPC32-NEXT: add 27, 23, 24 +; PPC32-NEXT: mulhwu 12, 8, 6 +; PPC32-NEXT: addc 12, 12, 27 +; PPC32-NEXT: addze 27, 19 +; PPC32-NEXT: mullw 22, 8, 6 +; PPC32-NEXT: mullw 21, 4, 10 +; PPC32-NEXT: addc 23, 21, 22 +; PPC32-NEXT: adde 21, 11, 12 +; PPC32-NEXT: mulhwu 26, 6, 10 +; PPC32-NEXT: mullw 20, 5, 10 +; PPC32-NEXT: addc 11, 20, 26 +; PPC32-NEXT: mulhwu 0, 5, 10 +; PPC32-NEXT: addze 12, 0 +; PPC32-NEXT: mullw 22, 6, 9 +; PPC32-NEXT: addc 11, 22, 11 +; PPC32-NEXT: mulhwu 25, 6, 9 +; PPC32-NEXT: addze 26, 25 +; PPC32-NEXT: addc 12, 12, 26 +; PPC32-NEXT: addze 26, 19 +; PPC32-NEXT: mullw 0, 5, 9 +; PPC32-NEXT: addc 12, 0, 12 +; PPC32-NEXT: mulhwu 24, 5, 9 +; PPC32-NEXT: adde 0, 24, 26 +; PPC32-NEXT: addc 12, 12, 23 +; PPC32-NEXT: adde 0, 0, 21 +; PPC32-NEXT: addze. 26, 19 ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: mulhwu. 29, 11, 6 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: mulhwu. 26, 7, 6 ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: cmpwi 12, 0 -; PPC32-NEXT: crnor 20, 2, 30 -; PPC32-NEXT: cmpwi 3, 0 -; PPC32-NEXT: cmpwi 7, 9, 0 -; PPC32-NEXT: crnor 24, 30, 2 -; PPC32-NEXT: mulhwu. 12, 3, 10 -; PPC32-NEXT: crorc 20, 20, 26 -; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: crnor 20, 14, 10 ; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: cmpwi 26, 0 -; PPC32-NEXT: crorc 28, 20, 2 -; PPC32-NEXT: mulhwu. 9, 9, 4 -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: crorc 20, 24, 30 -; PPC32-NEXT: or. 3, 4, 3 +; PPC32-NEXT: cmpwi 2, 30, 0 +; PPC32-NEXT: cmpwi 3, 29, 0 +; PPC32-NEXT: mulhwu. 5, 5, 8 ; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: or. 3, 8, 11 -; PPC32-NEXT: crorc 20, 20, 10 -; PPC32-NEXT: crnor 21, 2, 26 +; PPC32-NEXT: cmpwi 9, 0 +; PPC32-NEXT: crnor 21, 2, 30 +; PPC32-NEXT: crorc 20, 20, 26 +; PPC32-NEXT: crnor 23, 14, 10 +; PPC32-NEXT: mulhwu. 3, 3, 10 +; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: cmpwi 27, 0 +; PPC32-NEXT: crorc 20, 20, 2 +; PPC32-NEXT: crorc 21, 21, 30 +; PPC32-NEXT: mulhwu. 3, 9, 4 +; PPC32-NEXT: crorc 21, 21, 2 +; PPC32-NEXT: cmpwi 28, 0 +; PPC32-NEXT: crorc 21, 21, 2 +; PPC32-NEXT: cror 21, 23, 21 ; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: cror 20, 20, 28 -; PPC32-NEXT: crandc 20, 6, 20 +; PPC32-NEXT: crorc 20, 20, 6 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: bc 12, 20, .LBB0_2 -; PPC32-NEXT: # %bb.1: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: .LBB0_2: # %start -; PPC32-NEXT: lwz 12, 16(1) -; PPC32-NEXT: mr 3, 30 -; PPC32-NEXT: mr 4, 0 -; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: bc 4, 20, .LBB0_7 +; PPC32-NEXT: b .LBB0_8 +; PPC32-NEXT: .LBB0_3: # %overflow.no.lhs +; PPC32-NEXT: or. 
11, 8, 7 +; PPC32-NEXT: beq 0, .LBB0_9 +; PPC32-NEXT: # %bb.4: # %overflow.no.lhs.only +; PPC32-NEXT: mulhwu 29, 10, 4 +; PPC32-NEXT: mullw 20, 10, 3 +; PPC32-NEXT: add 29, 29, 20 +; PPC32-NEXT: mulhwu 12, 6, 10 +; PPC32-NEXT: mulhwu 0, 6, 9 +; PPC32-NEXT: mulhwu 30, 5, 9 +; PPC32-NEXT: mulhwu 24, 8, 4 +; PPC32-NEXT: mullw 23, 5, 10 +; PPC32-NEXT: addc 12, 23, 12 +; PPC32-NEXT: mullw 22, 6, 9 +; PPC32-NEXT: mullw 21, 5, 9 +; PPC32-NEXT: mullw 9, 9, 4 +; PPC32-NEXT: add 9, 29, 9 +; PPC32-NEXT: mullw 3, 8, 3 +; PPC32-NEXT: add 3, 24, 3 +; PPC32-NEXT: mulhwu 11, 5, 10 +; PPC32-NEXT: mullw 29, 7, 4 +; PPC32-NEXT: add 3, 3, 29 +; PPC32-NEXT: addze 29, 11 +; PPC32-NEXT: addc 11, 22, 12 +; PPC32-NEXT: addze 0, 0 +; PPC32-NEXT: li 12, 0 +; PPC32-NEXT: addc 0, 29, 0 +; PPC32-NEXT: addze 29, 12 +; PPC32-NEXT: addc 0, 21, 0 +; PPC32-NEXT: mullw 19, 10, 4 +; PPC32-NEXT: adde 30, 30, 29 +; PPC32-NEXT: addc 0, 0, 19 +; PPC32-NEXT: adde 9, 30, 9 +; PPC32-NEXT: mulhwu 27, 6, 8 +; PPC32-NEXT: mullw 18, 5, 8 +; PPC32-NEXT: addc 30, 18, 27 +; PPC32-NEXT: mulhwu 28, 5, 8 +; PPC32-NEXT: addze 29, 28 +; PPC32-NEXT: mulhwu 26, 6, 7 +; PPC32-NEXT: mulhwu 25, 5, 7 +; PPC32-NEXT: mullw 5, 5, 7 +; PPC32-NEXT: mullw 7, 6, 7 +; PPC32-NEXT: addc 7, 7, 30 +; PPC32-NEXT: addze 30, 26 +; PPC32-NEXT: addc 30, 29, 30 +; PPC32-NEXT: addze 12, 12 +; PPC32-NEXT: addc 5, 5, 30 +; PPC32-NEXT: mullw 4, 8, 4 +; PPC32-NEXT: adde 12, 25, 12 +; PPC32-NEXT: addc 4, 5, 4 +; PPC32-NEXT: adde 3, 12, 3 +; PPC32-NEXT: mullw 5, 6, 8 +; PPC32-NEXT: addc 12, 0, 5 +; PPC32-NEXT: adde 0, 9, 7 +; PPC32-NEXT: addze 4, 4 +; PPC32-NEXT: addze 3, 3 +; PPC32-NEXT: or. 3, 4, 3 +; PPC32-NEXT: mullw 6, 6, 10 +; PPC32-NEXT: b .LBB0_6 +; PPC32-NEXT: .LBB0_5: # %overflow.no.rhs.only +; PPC32-NEXT: mulhwu 29, 6, 8 +; PPC32-NEXT: mullw 20, 6, 7 +; PPC32-NEXT: add 29, 29, 20 +; PPC32-NEXT: mulhwu 12, 10, 6 +; PPC32-NEXT: mulhwu 0, 10, 5 +; PPC32-NEXT: mulhwu 30, 9, 5 +; PPC32-NEXT: mulhwu 24, 4, 8 +; PPC32-NEXT: mullw 23, 9, 6 +; PPC32-NEXT: addc 12, 23, 12 +; PPC32-NEXT: mullw 22, 10, 5 +; PPC32-NEXT: mullw 21, 9, 5 +; PPC32-NEXT: mullw 5, 5, 8 +; PPC32-NEXT: add 5, 29, 5 +; PPC32-NEXT: mullw 7, 4, 7 +; PPC32-NEXT: add 7, 24, 7 +; PPC32-NEXT: mulhwu 11, 9, 6 +; PPC32-NEXT: mullw 29, 3, 8 +; PPC32-NEXT: add 7, 7, 29 +; PPC32-NEXT: addze 29, 11 +; PPC32-NEXT: addc 11, 22, 12 +; PPC32-NEXT: addze 0, 0 +; PPC32-NEXT: li 12, 0 +; PPC32-NEXT: addc 0, 29, 0 +; PPC32-NEXT: addze 29, 12 +; PPC32-NEXT: addc 0, 21, 0 +; PPC32-NEXT: mullw 19, 6, 8 +; PPC32-NEXT: adde 30, 30, 29 +; PPC32-NEXT: addc 0, 0, 19 +; PPC32-NEXT: adde 5, 30, 5 +; PPC32-NEXT: mulhwu 27, 10, 4 +; PPC32-NEXT: mullw 18, 9, 4 +; PPC32-NEXT: addc 30, 18, 27 +; PPC32-NEXT: mulhwu 28, 9, 4 +; PPC32-NEXT: addze 29, 28 +; PPC32-NEXT: mulhwu 26, 10, 3 +; PPC32-NEXT: mulhwu 25, 9, 3 +; PPC32-NEXT: mullw 9, 9, 3 +; PPC32-NEXT: mullw 3, 10, 3 +; PPC32-NEXT: addc 3, 3, 30 +; PPC32-NEXT: addze 30, 26 +; PPC32-NEXT: addc 30, 29, 30 +; PPC32-NEXT: addze 12, 12 +; PPC32-NEXT: addc 9, 9, 30 +; PPC32-NEXT: mullw 8, 4, 8 +; PPC32-NEXT: adde 12, 25, 12 +; PPC32-NEXT: addc 8, 9, 8 +; PPC32-NEXT: adde 7, 12, 7 +; PPC32-NEXT: mullw 4, 10, 4 +; PPC32-NEXT: addc 12, 0, 4 +; PPC32-NEXT: adde 0, 5, 3 +; PPC32-NEXT: addze 3, 8 +; PPC32-NEXT: addze 4, 7 +; PPC32-NEXT: or. 
3, 3, 4 +; PPC32-NEXT: mullw 6, 10, 6 +; PPC32-NEXT: .LBB0_6: # %overflow.no.rhs.only +; PPC32-NEXT: crnot 20, 2 +; PPC32-NEXT: li 7, 1 +; PPC32-NEXT: bc 12, 20, .LBB0_8 +; PPC32-NEXT: .LBB0_7: # %overflow.res +; PPC32-NEXT: li 7, 0 +; PPC32-NEXT: .LBB0_8: # %overflow.res +; PPC32-NEXT: mr 4, 12 +; PPC32-NEXT: lwz 12, 20(1) +; PPC32-NEXT: mr 3, 0 +; PPC32-NEXT: mr 5, 11 +; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 64 +; PPC32-NEXT: mtcrf 16, 12 # cr3 +; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 18, 24(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 80 ; PPC32-NEXT: blr +; PPC32-NEXT: .LBB0_9: # %overflow.no +; PPC32-NEXT: mulhwu 11, 10, 4 +; PPC32-NEXT: mulhwu 12, 8, 6 +; PPC32-NEXT: mullw 3, 10, 3 +; PPC32-NEXT: add 3, 11, 3 +; PPC32-NEXT: mullw 26, 8, 5 +; PPC32-NEXT: mulhwu 0, 5, 10 +; PPC32-NEXT: mulhwu 30, 6, 10 +; PPC32-NEXT: mulhwu 29, 6, 9 +; PPC32-NEXT: mulhwu 28, 5, 9 +; PPC32-NEXT: mullw 27, 9, 4 +; PPC32-NEXT: add 3, 3, 27 +; PPC32-NEXT: mullw 7, 7, 6 +; PPC32-NEXT: mullw 4, 10, 4 +; PPC32-NEXT: mullw 8, 8, 6 +; PPC32-NEXT: addc 4, 8, 4 +; PPC32-NEXT: li 8, 0 +; PPC32-NEXT: mullw 25, 5, 10 +; PPC32-NEXT: mullw 5, 5, 9 +; PPC32-NEXT: mullw 9, 6, 9 +; PPC32-NEXT: mullw 6, 6, 10 +; PPC32-NEXT: add 10, 12, 26 +; PPC32-NEXT: add 7, 10, 7 +; PPC32-NEXT: adde 3, 7, 3 +; PPC32-NEXT: addc 7, 25, 30 +; PPC32-NEXT: addze 10, 0 +; PPC32-NEXT: addc 11, 9, 7 +; PPC32-NEXT: addze 7, 29 +; PPC32-NEXT: addc 7, 10, 7 +; PPC32-NEXT: addze 8, 8 +; PPC32-NEXT: addc 5, 5, 7 +; PPC32-NEXT: adde 7, 28, 8 +; PPC32-NEXT: addc 12, 5, 4 +; PPC32-NEXT: adde 0, 7, 3 +; PPC32-NEXT: li 7, 1 +; PPC32-NEXT: b .LBB0_7 start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index d6fd4f15c4e53..4c9aeaa3ba5a1 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -3,7 +3,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-LABEL: muloti_test: -; RISCV32: # %bb.0: # %start +; RISCV32: # %bb.0: # %overflow.entry ; RISCV32-NEXT: addi sp, sp, -32 ; RISCV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill @@ -11,100 +11,301 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sw s3, 16(sp) # 
4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a4, 0(a1) +; RISCV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill +; RISCV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill +; RISCV32-NEXT: lw a3, 0(a1) ; RISCV32-NEXT: lw t0, 4(a1) -; RISCV32-NEXT: lw a3, 8(a1) -; RISCV32-NEXT: lw a1, 12(a1) -; RISCV32-NEXT: lw a6, 0(a2) -; RISCV32-NEXT: lw a5, 4(a2) -; RISCV32-NEXT: lw a7, 8(a2) +; RISCV32-NEXT: lw a4, 8(a1) +; RISCV32-NEXT: lw a6, 12(a1) +; RISCV32-NEXT: lw a1, 0(a2) +; RISCV32-NEXT: lw a7, 4(a2) +; RISCV32-NEXT: lw a5, 8(a2) ; RISCV32-NEXT: lw a2, 12(a2) -; RISCV32-NEXT: mulhu t1, a4, a6 -; RISCV32-NEXT: mul t2, t0, a6 -; RISCV32-NEXT: mulhu t3, t0, a6 -; RISCV32-NEXT: mul t4, a4, a5 -; RISCV32-NEXT: mulhu t5, a4, a5 +; RISCV32-NEXT: or t4, a4, a6 +; RISCV32-NEXT: beqz t4, .LBB0_5 +; RISCV32-NEXT: # %bb.1: # %overflow.lhs +; RISCV32-NEXT: or t5, a5, a2 +; RISCV32-NEXT: beqz t5, .LBB0_9 +; RISCV32-NEXT: # %bb.2: # %overflow +; RISCV32-NEXT: mulhu t1, a3, a1 +; RISCV32-NEXT: mul t2, t0, a1 +; RISCV32-NEXT: mulhu t3, t0, a1 +; RISCV32-NEXT: mul t6, a3, a7 +; RISCV32-NEXT: mulhu s0, a3, a7 +; RISCV32-NEXT: mul s4, t0, a7 +; RISCV32-NEXT: mul s1, a5, a3 +; RISCV32-NEXT: mul s5, a4, a1 ; RISCV32-NEXT: mul s2, t0, a5 -; RISCV32-NEXT: mul t6, a7, a4 -; RISCV32-NEXT: mul s3, a3, a6 -; RISCV32-NEXT: mul s0, t0, a7 -; RISCV32-NEXT: mul s1, a2, a4 -; RISCV32-NEXT: mul s4, a5, a3 -; RISCV32-NEXT: add s1, s1, s0 -; RISCV32-NEXT: mul s0, a1, a6 -; RISCV32-NEXT: add s4, s0, s4 -; RISCV32-NEXT: mulhu s5, t0, a5 +; RISCV32-NEXT: mul s3, a2, a3 +; RISCV32-NEXT: mul s6, a7, a4 +; RISCV32-NEXT: add s3, s3, s2 +; RISCV32-NEXT: mul s2, a6, a1 +; RISCV32-NEXT: add s6, s2, s6 +; RISCV32-NEXT: mulhu s7, t0, a7 ; RISCV32-NEXT: add t1, t2, t1 ; RISCV32-NEXT: sltu t2, t1, t2 ; RISCV32-NEXT: add t2, t3, t2 -; RISCV32-NEXT: mulhu s0, a7, a4 -; RISCV32-NEXT: add t1, t4, t1 -; RISCV32-NEXT: sltu t3, t1, t4 -; RISCV32-NEXT: add t3, t5, t3 -; RISCV32-NEXT: mulhu t5, a3, a6 -; RISCV32-NEXT: add t4, s3, t6 -; RISCV32-NEXT: add s1, s0, s1 -; RISCV32-NEXT: add t6, t5, s4 -; RISCV32-NEXT: sltu s3, t4, s3 +; RISCV32-NEXT: mulhu s2, a5, a3 +; RISCV32-NEXT: add t1, t6, t1 +; RISCV32-NEXT: sltu t3, t1, t6 +; RISCV32-NEXT: add t3, s0, t3 +; RISCV32-NEXT: mulhu s0, a4, a1 +; RISCV32-NEXT: add t6, s5, s1 +; RISCV32-NEXT: add s3, s2, s3 +; RISCV32-NEXT: add s1, s0, s6 +; RISCV32-NEXT: sltu s5, t6, s5 ; RISCV32-NEXT: add t3, t2, t3 ; RISCV32-NEXT: sltu t2, t3, t2 -; RISCV32-NEXT: add s5, s5, t2 -; RISCV32-NEXT: add s4, t6, s1 -; RISCV32-NEXT: add t3, s2, t3 -; RISCV32-NEXT: add t2, t3, t4 -; RISCV32-NEXT: sltu s2, t3, s2 -; RISCV32-NEXT: sltu t4, t2, t3 -; RISCV32-NEXT: add s2, s5, s2 -; RISCV32-NEXT: add s3, s4, s3 -; RISCV32-NEXT: add t3, s2, s3 -; RISCV32-NEXT: add t3, t3, t4 -; RISCV32-NEXT: beq t3, s2, .LBB0_2 -; RISCV32-NEXT: # %bb.1: # %start -; RISCV32-NEXT: sltu t4, t3, s2 -; RISCV32-NEXT: .LBB0_2: # %start -; RISCV32-NEXT: sltu s0, s1, s0 -; RISCV32-NEXT: snez s1, t0 -; RISCV32-NEXT: snez s2, a2 -; RISCV32-NEXT: sltu t5, t6, t5 -; RISCV32-NEXT: mulhu t6, a2, a4 -; RISCV32-NEXT: mulhu t0, t0, a7 -; RISCV32-NEXT: or a2, a7, a2 -; RISCV32-NEXT: snez a7, a5 -; RISCV32-NEXT: mul a4, a4, a6 -; RISCV32-NEXT: mulhu a6, a1, a6 -; RISCV32-NEXT: mulhu a5, a5, a3 -; RISCV32-NEXT: or a3, a3, a1 -; RISCV32-NEXT: snez a1, a1 -; RISCV32-NEXT: and s1, s2, s1 -; RISCV32-NEXT: snez t6, t6 -; RISCV32-NEXT: snez t0, t0 -; RISCV32-NEXT: and a1, a1, a7 -; 
RISCV32-NEXT: snez a6, a6 -; RISCV32-NEXT: snez a5, a5 +; RISCV32-NEXT: add s7, s7, t2 +; RISCV32-NEXT: add s6, s1, s3 +; RISCV32-NEXT: add t3, s4, t3 +; RISCV32-NEXT: add t2, t3, t6 +; RISCV32-NEXT: sltu s4, t3, s4 +; RISCV32-NEXT: sltu t6, t2, t3 +; RISCV32-NEXT: add s4, s7, s4 +; RISCV32-NEXT: add s5, s6, s5 +; RISCV32-NEXT: add t3, s4, s5 +; RISCV32-NEXT: add t3, t3, t6 +; RISCV32-NEXT: beq t3, s4, .LBB0_4 +; RISCV32-NEXT: # %bb.3: # %overflow +; RISCV32-NEXT: sltu t6, t3, s4 +; RISCV32-NEXT: .LBB0_4: # %overflow +; RISCV32-NEXT: sltu s2, s3, s2 +; RISCV32-NEXT: snez s3, t0 +; RISCV32-NEXT: snez s4, a2 +; RISCV32-NEXT: mulhu a2, a2, a3 +; RISCV32-NEXT: mulhu a5, t0, a5 +; RISCV32-NEXT: sltu t0, s1, s0 +; RISCV32-NEXT: snez s0, a7 +; RISCV32-NEXT: snez s1, a6 +; RISCV32-NEXT: mulhu a6, a6, a1 +; RISCV32-NEXT: mulhu a4, a7, a4 +; RISCV32-NEXT: snez a7, t5 +; RISCV32-NEXT: snez t4, t4 +; RISCV32-NEXT: and t5, s4, s3 ; RISCV32-NEXT: snez a2, a2 -; RISCV32-NEXT: snez a3, a3 -; RISCV32-NEXT: or a7, s1, t6 -; RISCV32-NEXT: or a1, a1, a6 -; RISCV32-NEXT: and a2, a3, a2 -; RISCV32-NEXT: or a3, a7, t0 -; RISCV32-NEXT: or a1, a1, a5 -; RISCV32-NEXT: or a3, a3, s0 -; RISCV32-NEXT: or a1, a1, t5 -; RISCV32-NEXT: or a1, a2, a1 -; RISCV32-NEXT: or a1, a1, a3 -; RISCV32-NEXT: or a1, a1, t4 -; RISCV32-NEXT: andi a1, a1, 1 -; RISCV32-NEXT: sw a4, 0(a0) +; RISCV32-NEXT: snez a5, a5 +; RISCV32-NEXT: and s0, s1, s0 +; RISCV32-NEXT: snez a6, a6 +; RISCV32-NEXT: snez a4, a4 +; RISCV32-NEXT: and a7, t4, a7 +; RISCV32-NEXT: or a2, t5, a2 +; RISCV32-NEXT: or a6, s0, a6 +; RISCV32-NEXT: or a2, a2, a5 +; RISCV32-NEXT: or a4, a6, a4 +; RISCV32-NEXT: or a2, a2, s2 +; RISCV32-NEXT: or a4, a4, t0 +; RISCV32-NEXT: or a4, a7, a4 +; RISCV32-NEXT: or a2, a4, a2 +; RISCV32-NEXT: or t4, a2, t6 +; RISCV32-NEXT: j .LBB0_14 +; RISCV32-NEXT: .LBB0_5: # %overflow.no.lhs +; RISCV32-NEXT: or t1, a5, a2 +; RISCV32-NEXT: beqz t1, .LBB0_13 +; RISCV32-NEXT: # %bb.6: # %overflow.no.lhs.only +; RISCV32-NEXT: mulhu t1, a3, a1 +; RISCV32-NEXT: mul t6, t0, a1 +; RISCV32-NEXT: mulhu s0, t0, a1 +; RISCV32-NEXT: mul t4, a3, a7 +; RISCV32-NEXT: mulhu t5, a3, a7 +; RISCV32-NEXT: mul t2, t0, a7 +; RISCV32-NEXT: mulhu t3, t0, a7 +; RISCV32-NEXT: mulhu s1, a1, a4 +; RISCV32-NEXT: mul s2, a1, a6 +; RISCV32-NEXT: mul a7, a7, a4 +; RISCV32-NEXT: add s1, s1, s2 +; RISCV32-NEXT: mulhu s2, a5, a4 +; RISCV32-NEXT: mul a6, a5, a6 +; RISCV32-NEXT: add a6, s2, a6 +; RISCV32-NEXT: mulhu s2, a3, a5 +; RISCV32-NEXT: add a7, s1, a7 +; RISCV32-NEXT: mul s1, a2, a4 +; RISCV32-NEXT: add a6, a6, s1 +; RISCV32-NEXT: mul s1, t0, a5 +; RISCV32-NEXT: add t1, t6, t1 +; RISCV32-NEXT: sltu t6, t1, t6 +; RISCV32-NEXT: add t6, s0, t6 +; RISCV32-NEXT: mulhu s0, t0, a5 +; RISCV32-NEXT: add s2, s1, s2 +; RISCV32-NEXT: sltu s1, s2, s1 +; RISCV32-NEXT: add s0, s0, s1 +; RISCV32-NEXT: mul s1, a3, a2 +; RISCV32-NEXT: add t1, t4, t1 +; RISCV32-NEXT: sltu t4, t1, t4 +; RISCV32-NEXT: add t4, t5, t4 +; RISCV32-NEXT: mul t5, t0, a2 +; RISCV32-NEXT: mulhu t0, t0, a2 +; RISCV32-NEXT: mulhu a2, a3, a2 +; RISCV32-NEXT: add s2, s1, s2 +; RISCV32-NEXT: sltu s1, s2, s1 +; RISCV32-NEXT: add a2, a2, s1 +; RISCV32-NEXT: mul s1, a1, a4 +; RISCV32-NEXT: mul a4, a5, a4 +; RISCV32-NEXT: mul a5, a3, a5 +; RISCV32-NEXT: add t4, t6, t4 +; RISCV32-NEXT: add a2, s0, a2 +; RISCV32-NEXT: sltu t6, t4, t6 +; RISCV32-NEXT: add t4, t2, t4 +; RISCV32-NEXT: sltu s0, a2, s0 +; RISCV32-NEXT: add s3, t5, a2 +; RISCV32-NEXT: add s1, t4, s1 +; RISCV32-NEXT: sltu t2, t4, t2 +; RISCV32-NEXT: add t3, t3, t6 +; 
RISCV32-NEXT: add a2, s3, a4 +; RISCV32-NEXT: sltu a4, s3, t5 +; RISCV32-NEXT: add t0, t0, s0 +; RISCV32-NEXT: sltu t4, s1, t4 +; RISCV32-NEXT: add t3, t3, t2 +; RISCV32-NEXT: sltu t5, a2, s3 +; RISCV32-NEXT: add a4, t0, a4 +; RISCV32-NEXT: add t2, s1, a5 +; RISCV32-NEXT: add a7, t3, a7 +; RISCV32-NEXT: add a5, a4, a6 +; RISCV32-NEXT: sltu a4, t2, s1 +; RISCV32-NEXT: add a6, a7, t4 +; RISCV32-NEXT: add t3, s2, a4 +; RISCV32-NEXT: add t3, a6, t3 +; RISCV32-NEXT: add a5, a5, t5 +; RISCV32-NEXT: beq t3, a6, .LBB0_8 +; RISCV32-NEXT: # %bb.7: # %overflow.no.lhs.only +; RISCV32-NEXT: sltu a4, t3, a6 +; RISCV32-NEXT: .LBB0_8: # %overflow.no.lhs.only +; RISCV32-NEXT: mul a1, a3, a1 +; RISCV32-NEXT: j .LBB0_12 +; RISCV32-NEXT: .LBB0_9: # %overflow.no.rhs.only +; RISCV32-NEXT: mulhu t1, a1, a3 +; RISCV32-NEXT: mul t6, a7, a3 +; RISCV32-NEXT: mulhu s0, a7, a3 +; RISCV32-NEXT: mul t4, a1, t0 +; RISCV32-NEXT: mulhu t5, a1, t0 +; RISCV32-NEXT: mul t2, a7, t0 +; RISCV32-NEXT: mulhu t3, a7, t0 +; RISCV32-NEXT: mulhu s1, a3, a5 +; RISCV32-NEXT: mul s2, a3, a2 +; RISCV32-NEXT: mul t0, t0, a5 +; RISCV32-NEXT: add s1, s1, s2 +; RISCV32-NEXT: mulhu s2, a4, a5 +; RISCV32-NEXT: mul a2, a4, a2 +; RISCV32-NEXT: add a2, s2, a2 +; RISCV32-NEXT: mulhu s2, a1, a4 +; RISCV32-NEXT: add t0, s1, t0 +; RISCV32-NEXT: mul s1, a6, a5 +; RISCV32-NEXT: add s1, a2, s1 +; RISCV32-NEXT: mul a2, a7, a4 +; RISCV32-NEXT: add t1, t6, t1 +; RISCV32-NEXT: sltu t6, t1, t6 +; RISCV32-NEXT: add t6, s0, t6 +; RISCV32-NEXT: mulhu s0, a7, a4 +; RISCV32-NEXT: add s2, a2, s2 +; RISCV32-NEXT: sltu a2, s2, a2 +; RISCV32-NEXT: add a2, s0, a2 +; RISCV32-NEXT: mul s0, a1, a6 +; RISCV32-NEXT: add t1, t4, t1 +; RISCV32-NEXT: sltu t4, t1, t4 +; RISCV32-NEXT: add t4, t5, t4 +; RISCV32-NEXT: mul t5, a7, a6 +; RISCV32-NEXT: mulhu a7, a7, a6 +; RISCV32-NEXT: mulhu a6, a1, a6 +; RISCV32-NEXT: add s2, s0, s2 +; RISCV32-NEXT: sltu s0, s2, s0 +; RISCV32-NEXT: add a6, a6, s0 +; RISCV32-NEXT: mul s0, a3, a5 +; RISCV32-NEXT: mul a5, a4, a5 +; RISCV32-NEXT: mul a4, a1, a4 +; RISCV32-NEXT: add t4, t6, t4 +; RISCV32-NEXT: add a6, a2, a6 +; RISCV32-NEXT: sltu t6, t4, t6 +; RISCV32-NEXT: add t4, t2, t4 +; RISCV32-NEXT: sltu s3, a6, a2 +; RISCV32-NEXT: add a6, t5, a6 +; RISCV32-NEXT: add s0, t4, s0 +; RISCV32-NEXT: sltu t2, t4, t2 +; RISCV32-NEXT: add t3, t3, t6 +; RISCV32-NEXT: add a2, a6, a5 +; RISCV32-NEXT: sltu a5, a6, t5 +; RISCV32-NEXT: add a7, a7, s3 +; RISCV32-NEXT: sltu t4, s0, t4 +; RISCV32-NEXT: add t3, t3, t2 +; RISCV32-NEXT: sltu t5, a2, a6 +; RISCV32-NEXT: add a5, a7, a5 +; RISCV32-NEXT: add t2, s0, a4 +; RISCV32-NEXT: add a6, t3, t0 +; RISCV32-NEXT: add a5, a5, s1 +; RISCV32-NEXT: sltu a4, t2, s0 +; RISCV32-NEXT: add a6, a6, t4 +; RISCV32-NEXT: add t3, s2, a4 +; RISCV32-NEXT: add t3, a6, t3 +; RISCV32-NEXT: add a5, a5, t5 +; RISCV32-NEXT: beq t3, a6, .LBB0_11 +; RISCV32-NEXT: # %bb.10: # %overflow.no.rhs.only +; RISCV32-NEXT: sltu a4, t3, a6 +; RISCV32-NEXT: .LBB0_11: # %overflow.no.rhs.only +; RISCV32-NEXT: mul a1, a1, a3 +; RISCV32-NEXT: .LBB0_12: # %overflow.res +; RISCV32-NEXT: add a4, a2, a4 +; RISCV32-NEXT: sltu a2, a4, a2 +; RISCV32-NEXT: add a2, a5, a2 +; RISCV32-NEXT: or a2, a4, a2 +; RISCV32-NEXT: snez t4, a2 +; RISCV32-NEXT: j .LBB0_15 +; RISCV32-NEXT: .LBB0_13: # %overflow.no +; RISCV32-NEXT: li t4, 0 +; RISCV32-NEXT: mulhu t1, a3, a1 +; RISCV32-NEXT: mul t2, t0, a1 +; RISCV32-NEXT: mulhu t3, t0, a1 +; RISCV32-NEXT: mul t5, a3, a7 +; RISCV32-NEXT: mulhu t6, a3, a7 +; RISCV32-NEXT: mul s0, t0, a7 +; RISCV32-NEXT: mul s1, a5, t0 +; 
RISCV32-NEXT: mulhu s2, a5, a3 +; RISCV32-NEXT: add s1, s2, s1 +; RISCV32-NEXT: mul s2, a1, a4 +; RISCV32-NEXT: mul a5, a5, a3 +; RISCV32-NEXT: mulhu t0, t0, a7 +; RISCV32-NEXT: mul a2, a2, a3 +; RISCV32-NEXT: mul a7, a7, a4 +; RISCV32-NEXT: mulhu a4, a1, a4 +; RISCV32-NEXT: mul a6, a1, a6 +; RISCV32-NEXT: add t1, t2, t1 +; RISCV32-NEXT: add s2, a5, s2 +; RISCV32-NEXT: add a4, a4, a6 +; RISCV32-NEXT: sltu a6, t1, t2 +; RISCV32-NEXT: add t1, t5, t1 +; RISCV32-NEXT: add a2, s1, a2 +; RISCV32-NEXT: add a4, a4, a7 +; RISCV32-NEXT: sltu a5, s2, a5 +; RISCV32-NEXT: add a6, t3, a6 +; RISCV32-NEXT: sltu a7, t1, t5 +; RISCV32-NEXT: add a2, a2, a4 +; RISCV32-NEXT: add a7, t6, a7 +; RISCV32-NEXT: add a2, a2, a5 +; RISCV32-NEXT: add a7, a6, a7 +; RISCV32-NEXT: add a4, s0, a7 +; RISCV32-NEXT: sltu a5, a7, a6 +; RISCV32-NEXT: add t2, a4, s2 +; RISCV32-NEXT: sltu a6, a4, s0 +; RISCV32-NEXT: add a5, t0, a5 +; RISCV32-NEXT: sltu t3, t2, a4 +; RISCV32-NEXT: add a5, a5, a6 +; RISCV32-NEXT: add a2, a5, a2 +; RISCV32-NEXT: add t3, a2, t3 +; RISCV32-NEXT: .LBB0_14: # %overflow.res +; RISCV32-NEXT: mul a1, a3, a1 +; RISCV32-NEXT: .LBB0_15: # %overflow.res +; RISCV32-NEXT: andi a2, t4, 1 +; RISCV32-NEXT: sw a1, 0(a0) ; RISCV32-NEXT: sw t1, 4(a0) ; RISCV32-NEXT: sw t2, 8(a0) ; RISCV32-NEXT: sw t3, 12(a0) -; RISCV32-NEXT: sb a1, 16(a0) +; RISCV32-NEXT: sb a2, 16(a0) ; RISCV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload +; RISCV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload +; RISCV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload ; RISCV32-NEXT: addi sp, sp, 32 ; RISCV32-NEXT: ret start: diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 2751332c9e3ae..5ff0cffb598dc 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1314,38 +1314,173 @@ entry: define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: smulo.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a6, a0, 31 +; RV32-NEXT: srai a5, a2, 31 +; RV32-NEXT: beq a1, a6, .LBB21_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beq a3, a5, .LBB21_6 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mulhu a5, a0, a2 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: mulhsu a7, a1, a2 ; RV32-NEXT: mul t0, a3, a0 ; RV32-NEXT: mulh t1, a1, a3 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: mul t2, a1, a3 ; RV32-NEXT: mulhsu a3, a3, a0 -; RV32-NEXT: mul a2, a0, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: sltu a0, a5, a6 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: add a0, a7, a0 -; RV32-NEXT: sltu a6, a5, t0 -; RV32-NEXT: srai a7, a5, 31 +; RV32-NEXT: add a1, a6, a5 +; RV32-NEXT: sltu a5, a1, a6 +; RV32-NEXT: add a1, t0, a1 +; RV32-NEXT: add a5, a7, a5 +; RV32-NEXT: sltu a6, a1, t0 ; RV32-NEXT: add a3, a3, a6 -; RV32-NEXT: srai a6, a0, 31 -; RV32-NEXT: add t0, a0, a3 -; RV32-NEXT: srai a3, a3, 31 -; RV32-NEXT: sltu a0, t0, a0 +; RV32-NEXT: srai a6, a5, 31 +; RV32-NEXT: srai a7, a3, 31 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: srai a7, a1, 31 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: sltu a5, a3, a5 +; RV32-NEXT: add a3, t2, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a6, a3, t2 +; RV32-NEXT: xor a3, a3, a7 +; RV32-NEXT: add a5, t1, a5 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: xor a5, a5, a7 +; 
RV32-NEXT: or a3, a3, a5 +; RV32-NEXT: snez a5, a3 +; RV32-NEXT: j .LBB21_9 +; RV32-NEXT: .LBB21_3: # %overflow.no.lhs +; RV32-NEXT: beq a3, a5, .LBB21_8 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: bltz a1, .LBB21_10 +; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32-NEXT: mv a5, a0 +; RV32-NEXT: mv a6, a1 +; RV32-NEXT: bgez a1, .LBB21_11 +; RV32-NEXT: j .LBB21_12 +; RV32-NEXT: .LBB21_6: # %overflow.no.rhs.only +; RV32-NEXT: bltz a3, .LBB21_14 +; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32-NEXT: mv a5, a2 +; RV32-NEXT: mv a6, a3 +; RV32-NEXT: bgez a3, .LBB21_15 +; RV32-NEXT: j .LBB21_16 +; RV32-NEXT: .LBB21_8: # %overflow.no +; RV32-NEXT: li a5, 0 +; RV32-NEXT: mulhu a6, a0, a2 +; RV32-NEXT: mul a3, a0, a3 ; RV32-NEXT: add a3, a6, a3 -; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: .LBB21_9: # %overflow.res +; RV32-NEXT: mul a2, a0, a2 +; RV32-NEXT: j .LBB21_27 +; RV32-NEXT: .LBB21_10: +; RV32-NEXT: neg a5, a0 +; RV32-NEXT: snez a6, a0 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: bltz a1, .LBB21_12 +; RV32-NEXT: .LBB21_11: # %overflow.no.lhs.only +; RV32-NEXT: mv a6, a1 +; RV32-NEXT: mv a5, a0 +; RV32-NEXT: .LBB21_12: # %overflow.no.lhs.only +; RV32-NEXT: bltz a3, .LBB21_18 +; RV32-NEXT: # %bb.13: # %overflow.no.lhs.only +; RV32-NEXT: mv a7, a2 +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: j .LBB21_19 +; RV32-NEXT: .LBB21_14: +; RV32-NEXT: neg a5, a2 +; RV32-NEXT: snez a6, a2 +; RV32-NEXT: neg a7, a3 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: bltz a3, .LBB21_16 +; RV32-NEXT: .LBB21_15: # %overflow.no.rhs.only +; RV32-NEXT: mv a6, a3 +; RV32-NEXT: mv a5, a2 +; RV32-NEXT: .LBB21_16: # %overflow.no.rhs.only +; RV32-NEXT: bltz a1, .LBB21_22 +; RV32-NEXT: # %bb.17: # %overflow.no.rhs.only +; RV32-NEXT: mv a7, a0 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: j .LBB21_23 +; RV32-NEXT: .LBB21_18: +; RV32-NEXT: neg a7, a2 +; RV32-NEXT: snez a0, a2 +; RV32-NEXT: neg t0, a3 +; RV32-NEXT: sub a0, t0, a0 +; RV32-NEXT: .LBB21_19: # %overflow.no.lhs.only +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: slti t0, a3, 0 +; RV32-NEXT: bltz a3, .LBB21_21 +; RV32-NEXT: # %bb.20: # %overflow.no.lhs.only +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: mv a7, a2 +; RV32-NEXT: .LBB21_21: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a2, a5, a7 +; RV32-NEXT: mul a3, a6, a7 +; RV32-NEXT: mul a7, a5, a7 +; RV32-NEXT: mul a6, a6, a0 +; RV32-NEXT: mulhu t1, a5, a0 +; RV32-NEXT: mul a0, a5, a0 +; RV32-NEXT: xor a1, t0, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a6, t1, a6 +; RV32-NEXT: neg a3, a1 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: xor a5, a7, a3 +; RV32-NEXT: sltu a7, a0, a2 +; RV32-NEXT: add a2, a5, a1 +; RV32-NEXT: xor a0, a0, a3 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: sltu a5, a2, a1 +; RV32-NEXT: add a1, a0, a5 +; RV32-NEXT: sltu a0, a1, a5 +; RV32-NEXT: xor a3, a6, a3 ; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: sltu a1, t0, a1 -; RV32-NEXT: xor a3, t0, a7 -; RV32-NEXT: add a0, t1, a0 +; RV32-NEXT: j .LBB21_26 +; RV32-NEXT: .LBB21_22: +; RV32-NEXT: neg a7, a0 +; RV32-NEXT: snez a2, a0 +; RV32-NEXT: neg t0, a1 +; RV32-NEXT: sub a2, t0, a2 +; RV32-NEXT: .LBB21_23: # %overflow.no.rhs.only +; RV32-NEXT: slti a3, a3, 0 +; RV32-NEXT: slti t0, a1, 0 +; RV32-NEXT: bltz a1, .LBB21_25 +; RV32-NEXT: # %bb.24: # %overflow.no.rhs.only +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a7, a0 +; RV32-NEXT: .LBB21_25: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a0, a5, a7 +; RV32-NEXT: mul a1, a6, a7 +; RV32-NEXT: mul a7, a5, a7 +; RV32-NEXT: mul a6, a6, 
a2 +; RV32-NEXT: mulhu t1, a5, a2 +; RV32-NEXT: mul a2, a5, a2 +; RV32-NEXT: xor a3, a3, t0 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: xor a0, a0, a7 -; RV32-NEXT: or a0, a3, a0 -; RV32-NEXT: snez a0, a0 +; RV32-NEXT: add a6, t1, a6 +; RV32-NEXT: neg a5, a3 +; RV32-NEXT: add a1, a0, a2 +; RV32-NEXT: xor a2, a7, a5 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: add a0, a6, a0 +; RV32-NEXT: sltu a3, a2, a3 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: sltu a3, a1, a3 +; RV32-NEXT: xor a0, a0, a5 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: .LBB21_26: # %overflow.res +; RV32-NEXT: snez a5, a0 +; RV32-NEXT: .LBB21_27: # %overflow.res +; RV32-NEXT: andi a0, a5, 1 ; RV32-NEXT: sw a2, 0(a4) -; RV32-NEXT: sw a5, 4(a4) +; RV32-NEXT: sw a1, 4(a4) ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.i64: @@ -1359,38 +1494,173 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a6, a0, 31 +; RV32ZBA-NEXT: srai a5, a2, 31 +; RV32ZBA-NEXT: beq a1, a6, .LBB21_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beq a3, a5, .LBB21_6 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mulhu a5, a0, a2 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: mulhsu a7, a1, a2 ; RV32ZBA-NEXT: mul t0, a3, a0 ; RV32ZBA-NEXT: mulh t1, a1, a3 -; RV32ZBA-NEXT: mul a1, a1, a3 +; RV32ZBA-NEXT: mul t2, a1, a3 ; RV32ZBA-NEXT: mulhsu a3, a3, a0 -; RV32ZBA-NEXT: mul a2, a0, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: sltu a0, a5, a6 -; RV32ZBA-NEXT: add a5, t0, a5 -; RV32ZBA-NEXT: add a0, a7, a0 -; RV32ZBA-NEXT: sltu a6, a5, t0 -; RV32ZBA-NEXT: srai a7, a5, 31 +; RV32ZBA-NEXT: add a1, a6, a5 +; RV32ZBA-NEXT: sltu a5, a1, a6 +; RV32ZBA-NEXT: add a1, t0, a1 +; RV32ZBA-NEXT: add a5, a7, a5 +; RV32ZBA-NEXT: sltu a6, a1, t0 ; RV32ZBA-NEXT: add a3, a3, a6 -; RV32ZBA-NEXT: srai a6, a0, 31 -; RV32ZBA-NEXT: add t0, a0, a3 -; RV32ZBA-NEXT: srai a3, a3, 31 -; RV32ZBA-NEXT: sltu a0, t0, a0 +; RV32ZBA-NEXT: srai a6, a5, 31 +; RV32ZBA-NEXT: srai a7, a3, 31 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: srai a7, a1, 31 +; RV32ZBA-NEXT: add a3, a5, a3 +; RV32ZBA-NEXT: sltu a5, a3, a5 +; RV32ZBA-NEXT: add a3, t2, a3 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: sltu a6, a3, t2 +; RV32ZBA-NEXT: xor a3, a3, a7 +; RV32ZBA-NEXT: add a5, t1, a5 +; RV32ZBA-NEXT: add a5, a5, a6 +; RV32ZBA-NEXT: xor a5, a5, a7 +; RV32ZBA-NEXT: or a3, a3, a5 +; RV32ZBA-NEXT: snez a5, a3 +; RV32ZBA-NEXT: j .LBB21_9 +; RV32ZBA-NEXT: .LBB21_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beq a3, a5, .LBB21_8 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a1, .LBB21_10 +; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a5, a0 +; RV32ZBA-NEXT: mv a6, a1 +; RV32ZBA-NEXT: bgez a1, .LBB21_11 +; RV32ZBA-NEXT: j .LBB21_12 +; RV32ZBA-NEXT: .LBB21_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a3, .LBB21_14 +; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a5, a2 +; RV32ZBA-NEXT: mv a6, a3 +; RV32ZBA-NEXT: bgez a3, .LBB21_15 +; RV32ZBA-NEXT: j .LBB21_16 +; RV32ZBA-NEXT: .LBB21_8: # %overflow.no +; RV32ZBA-NEXT: li a5, 0 +; RV32ZBA-NEXT: mulhu a6, a0, a2 +; RV32ZBA-NEXT: mul a3, a0, a3 ; RV32ZBA-NEXT: add a3, a6, a3 -; RV32ZBA-NEXT: add t0, a1, t0 +; RV32ZBA-NEXT: mul a1, a1, a2 +; RV32ZBA-NEXT: add a1, a3, a1 +; RV32ZBA-NEXT: .LBB21_9: # %overflow.res +; RV32ZBA-NEXT: mul a2, a0, a2 +; RV32ZBA-NEXT: j .LBB21_27 +; 
RV32ZBA-NEXT: .LBB21_10: +; RV32ZBA-NEXT: neg a5, a0 +; RV32ZBA-NEXT: snez a6, a0 +; RV32ZBA-NEXT: neg a7, a1 +; RV32ZBA-NEXT: sub a6, a7, a6 +; RV32ZBA-NEXT: bltz a1, .LBB21_12 +; RV32ZBA-NEXT: .LBB21_11: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a6, a1 +; RV32ZBA-NEXT: mv a5, a0 +; RV32ZBA-NEXT: .LBB21_12: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a3, .LBB21_18 +; RV32ZBA-NEXT: # %bb.13: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a7, a2 +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: j .LBB21_19 +; RV32ZBA-NEXT: .LBB21_14: +; RV32ZBA-NEXT: neg a5, a2 +; RV32ZBA-NEXT: snez a6, a2 +; RV32ZBA-NEXT: neg a7, a3 +; RV32ZBA-NEXT: sub a6, a7, a6 +; RV32ZBA-NEXT: bltz a3, .LBB21_16 +; RV32ZBA-NEXT: .LBB21_15: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a6, a3 +; RV32ZBA-NEXT: mv a5, a2 +; RV32ZBA-NEXT: .LBB21_16: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a1, .LBB21_22 +; RV32ZBA-NEXT: # %bb.17: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a7, a0 +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: j .LBB21_23 +; RV32ZBA-NEXT: .LBB21_18: +; RV32ZBA-NEXT: neg a7, a2 +; RV32ZBA-NEXT: snez a0, a2 +; RV32ZBA-NEXT: neg t0, a3 +; RV32ZBA-NEXT: sub a0, t0, a0 +; RV32ZBA-NEXT: .LBB21_19: # %overflow.no.lhs.only +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: slti t0, a3, 0 +; RV32ZBA-NEXT: bltz a3, .LBB21_21 +; RV32ZBA-NEXT: # %bb.20: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: mv a7, a2 +; RV32ZBA-NEXT: .LBB21_21: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a2, a5, a7 +; RV32ZBA-NEXT: mul a3, a6, a7 +; RV32ZBA-NEXT: mul a7, a5, a7 +; RV32ZBA-NEXT: mul a6, a6, a0 +; RV32ZBA-NEXT: mulhu t1, a5, a0 +; RV32ZBA-NEXT: mul a0, a5, a0 +; RV32ZBA-NEXT: xor a1, t0, a1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a6, t1, a6 +; RV32ZBA-NEXT: neg a3, a1 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: xor a5, a7, a3 +; RV32ZBA-NEXT: sltu a7, a0, a2 +; RV32ZBA-NEXT: add a2, a5, a1 +; RV32ZBA-NEXT: xor a0, a0, a3 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: sltu a5, a2, a1 +; RV32ZBA-NEXT: add a1, a0, a5 +; RV32ZBA-NEXT: sltu a0, a1, a5 +; RV32ZBA-NEXT: xor a3, a6, a3 ; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: sltu a1, t0, a1 -; RV32ZBA-NEXT: xor a3, t0, a7 -; RV32ZBA-NEXT: add a0, t1, a0 +; RV32ZBA-NEXT: j .LBB21_26 +; RV32ZBA-NEXT: .LBB21_22: +; RV32ZBA-NEXT: neg a7, a0 +; RV32ZBA-NEXT: snez a2, a0 +; RV32ZBA-NEXT: neg t0, a1 +; RV32ZBA-NEXT: sub a2, t0, a2 +; RV32ZBA-NEXT: .LBB21_23: # %overflow.no.rhs.only +; RV32ZBA-NEXT: slti a3, a3, 0 +; RV32ZBA-NEXT: slti t0, a1, 0 +; RV32ZBA-NEXT: bltz a1, .LBB21_25 +; RV32ZBA-NEXT: # %bb.24: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: mv a7, a0 +; RV32ZBA-NEXT: .LBB21_25: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a0, a5, a7 +; RV32ZBA-NEXT: mul a1, a6, a7 +; RV32ZBA-NEXT: mul a7, a5, a7 +; RV32ZBA-NEXT: mul a6, a6, a2 +; RV32ZBA-NEXT: mulhu t1, a5, a2 +; RV32ZBA-NEXT: mul a2, a5, a2 +; RV32ZBA-NEXT: xor a3, a3, t0 ; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: xor a0, a0, a7 -; RV32ZBA-NEXT: or a0, a3, a0 -; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: add a6, t1, a6 +; RV32ZBA-NEXT: neg a5, a3 +; RV32ZBA-NEXT: add a1, a0, a2 +; RV32ZBA-NEXT: xor a2, a7, a5 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: xor a1, a1, a5 +; RV32ZBA-NEXT: add a0, a6, a0 +; RV32ZBA-NEXT: sltu a3, a2, a3 +; RV32ZBA-NEXT: add a1, a1, a3 +; RV32ZBA-NEXT: sltu a3, a1, a3 +; RV32ZBA-NEXT: xor a0, a0, a5 +; RV32ZBA-NEXT: add a0, a0, a3 +; RV32ZBA-NEXT: .LBB21_26: # %overflow.res +; RV32ZBA-NEXT: snez 
a5, a0 +; RV32ZBA-NEXT: .LBB21_27: # %overflow.res +; RV32ZBA-NEXT: andi a0, a5, 1 ; RV32ZBA-NEXT: sw a2, 0(a4) -; RV32ZBA-NEXT: sw a5, 4(a4) +; RV32ZBA-NEXT: sw a1, 4(a4) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.i64: @@ -1404,38 +1674,165 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a6, a0, 31 +; RV32ZICOND-NEXT: srai a5, a2, 31 +; RV32ZICOND-NEXT: beq a1, a6, .LBB21_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beq a3, a5, .LBB21_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhsu a7, a1, a2 ; RV32ZICOND-NEXT: mul t0, a3, a0 ; RV32ZICOND-NEXT: mulh t1, a1, a3 -; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: mul t2, a1, a3 ; RV32ZICOND-NEXT: mulhsu a3, a3, a0 -; RV32ZICOND-NEXT: mul a2, a0, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: sltu a0, a5, a6 -; RV32ZICOND-NEXT: add a5, t0, a5 -; RV32ZICOND-NEXT: add a0, a7, a0 -; RV32ZICOND-NEXT: sltu a6, a5, t0 -; RV32ZICOND-NEXT: srai a7, a5, 31 +; RV32ZICOND-NEXT: add a1, a6, a5 +; RV32ZICOND-NEXT: sltu a5, a1, a6 +; RV32ZICOND-NEXT: add a1, t0, a1 +; RV32ZICOND-NEXT: add a5, a7, a5 +; RV32ZICOND-NEXT: sltu a6, a1, t0 ; RV32ZICOND-NEXT: add a3, a3, a6 -; RV32ZICOND-NEXT: srai a6, a0, 31 -; RV32ZICOND-NEXT: add t0, a0, a3 -; RV32ZICOND-NEXT: srai a3, a3, 31 -; RV32ZICOND-NEXT: sltu a0, t0, a0 -; RV32ZICOND-NEXT: add a3, a6, a3 -; RV32ZICOND-NEXT: add t0, a1, t0 +; RV32ZICOND-NEXT: srai a6, a5, 31 +; RV32ZICOND-NEXT: srai a7, a3, 31 +; RV32ZICOND-NEXT: add a6, a6, a7 +; RV32ZICOND-NEXT: srai a7, a1, 31 +; RV32ZICOND-NEXT: add a3, a5, a3 +; RV32ZICOND-NEXT: sltu a5, a3, a5 +; RV32ZICOND-NEXT: add a3, t2, a3 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: sltu a6, a3, t2 +; RV32ZICOND-NEXT: xor a3, a3, a7 +; RV32ZICOND-NEXT: add a5, t1, a5 +; RV32ZICOND-NEXT: add a5, a5, a6 +; RV32ZICOND-NEXT: xor a5, a5, a7 +; RV32ZICOND-NEXT: or a3, a3, a5 +; RV32ZICOND-NEXT: snez a5, a3 +; RV32ZICOND-NEXT: j .LBB21_7 +; RV32ZICOND-NEXT: .LBB21_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beq a3, a5, .LBB21_6 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: slti a5, a1, 0 +; RV32ZICOND-NEXT: neg a6, a0 +; RV32ZICOND-NEXT: snez a7, a0 +; RV32ZICOND-NEXT: neg t0, a1 +; RV32ZICOND-NEXT: snez t1, a2 +; RV32ZICOND-NEXT: sub a7, t0, a7 +; RV32ZICOND-NEXT: neg t0, a3 +; RV32ZICOND-NEXT: sub t0, t0, t1 +; RV32ZICOND-NEXT: slti t1, a3, 0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 +; RV32ZICOND-NEXT: czero.nez a0, a0, a5 +; RV32ZICOND-NEXT: or a6, a6, a0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 +; RV32ZICOND-NEXT: or a0, a6, a0 +; RV32ZICOND-NEXT: neg a6, a2 +; RV32ZICOND-NEXT: czero.nez a1, a1, a5 +; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 +; RV32ZICOND-NEXT: czero.nez a2, a2, t1 +; RV32ZICOND-NEXT: czero.nez a3, a3, t1 +; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 +; RV32ZICOND-NEXT: or a7, a7, a1 +; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 +; RV32ZICOND-NEXT: xor a5, t1, a5 +; RV32ZICOND-NEXT: or a6, a6, a2 +; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 +; RV32ZICOND-NEXT: or t0, t0, a3 +; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 +; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 +; RV32ZICOND-NEXT: neg t1, a5 +; RV32ZICOND-NEXT: or a2, a6, a2 +; RV32ZICOND-NEXT: or a1, a7, a1 +; RV32ZICOND-NEXT: or a3, t0, a3 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: mul a7, 
a0, a2 +; RV32ZICOND-NEXT: mul a2, a1, a2 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: mulhu t0, a0, a3 +; RV32ZICOND-NEXT: mul a0, a0, a3 +; RV32ZICOND-NEXT: xor a3, a7, t1 +; RV32ZICOND-NEXT: add a6, a6, a2 +; RV32ZICOND-NEXT: add a1, t0, a1 +; RV32ZICOND-NEXT: add a2, a3, a5 +; RV32ZICOND-NEXT: add a0, a6, a0 +; RV32ZICOND-NEXT: sltu a3, a2, a5 +; RV32ZICOND-NEXT: sltu a5, a0, a6 +; RV32ZICOND-NEXT: xor a0, a0, t1 +; RV32ZICOND-NEXT: add a5, a1, a5 +; RV32ZICOND-NEXT: add a1, a0, a3 +; RV32ZICOND-NEXT: sltu a0, a1, a3 +; RV32ZICOND-NEXT: xor a3, a5, t1 ; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: sltu a1, t0, a1 -; RV32ZICOND-NEXT: xor a3, t0, a7 -; RV32ZICOND-NEXT: add a0, t1, a0 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: xor a0, a0, a7 -; RV32ZICOND-NEXT: or a0, a3, a0 -; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: snez a5, a0 +; RV32ZICOND-NEXT: j .LBB21_8 +; RV32ZICOND-NEXT: .LBB21_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: slti a5, a3, 0 +; RV32ZICOND-NEXT: neg a6, a2 +; RV32ZICOND-NEXT: snez a7, a2 +; RV32ZICOND-NEXT: neg t0, a3 +; RV32ZICOND-NEXT: snez t1, a0 +; RV32ZICOND-NEXT: sub a7, t0, a7 +; RV32ZICOND-NEXT: neg t0, a1 +; RV32ZICOND-NEXT: sub t0, t0, t1 +; RV32ZICOND-NEXT: slti t1, a1, 0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 +; RV32ZICOND-NEXT: czero.nez a2, a2, a5 +; RV32ZICOND-NEXT: or a6, a6, a2 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 +; RV32ZICOND-NEXT: or a2, a6, a2 +; RV32ZICOND-NEXT: neg a6, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a5 +; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 +; RV32ZICOND-NEXT: czero.nez a0, a0, t1 +; RV32ZICOND-NEXT: czero.nez a1, a1, t1 +; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 +; RV32ZICOND-NEXT: or a7, a7, a3 +; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 +; RV32ZICOND-NEXT: xor a5, a5, t1 +; RV32ZICOND-NEXT: or a6, a6, a0 +; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 +; RV32ZICOND-NEXT: or t0, t0, a1 +; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 +; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 +; RV32ZICOND-NEXT: neg t1, a5 +; RV32ZICOND-NEXT: or a0, a6, a0 +; RV32ZICOND-NEXT: or a3, a7, a3 +; RV32ZICOND-NEXT: or a1, t0, a1 +; RV32ZICOND-NEXT: mulhu a6, a2, a0 +; RV32ZICOND-NEXT: mul a7, a2, a0 +; RV32ZICOND-NEXT: mul a0, a3, a0 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: mulhu t0, a2, a1 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: xor a2, a7, t1 +; RV32ZICOND-NEXT: add a0, a6, a0 +; RV32ZICOND-NEXT: add a3, t0, a3 +; RV32ZICOND-NEXT: add a2, a2, a5 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a5, a2, a5 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: xor a1, a1, t1 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: add a1, a1, a5 +; RV32ZICOND-NEXT: sltu a3, a1, a5 +; RV32ZICOND-NEXT: xor a0, a0, t1 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: snez a5, a0 +; RV32ZICOND-NEXT: j .LBB21_8 +; RV32ZICOND-NEXT: .LBB21_6: # %overflow.no +; RV32ZICOND-NEXT: li a5, 0 +; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: add a3, a6, a3 +; RV32ZICOND-NEXT: mul a1, a1, a2 +; RV32ZICOND-NEXT: add a1, a3, a1 +; RV32ZICOND-NEXT: .LBB21_7: # %overflow.res +; RV32ZICOND-NEXT: mul a2, a0, a2 +; RV32ZICOND-NEXT: .LBB21_8: # %overflow.res +; RV32ZICOND-NEXT: andi a0, a5, 1 ; RV32ZICOND-NEXT: sw a2, 0(a4) -; RV32ZICOND-NEXT: sw a5, 4(a4) +; RV32ZICOND-NEXT: sw a1, 4(a4) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.i64: @@ -1457,23 +1854,57 @@ entry: define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: smulo2.i64: -; RV32: # %bb.0: # 
%entry -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mulhu a4, a0, a3 -; RV32-NEXT: mul a5, a1, a3 -; RV32-NEXT: mulh a1, a1, a3 -; RV32-NEXT: mul a3, a0, a3 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: sltu a0, a4, a5 -; RV32-NEXT: srai a5, a4, 31 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: xor a1, a0, a5 -; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: xor a0, a0, a5 -; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: sw a3, 0(a2) -; RV32-NEXT: sw a4, 4(a2) +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a3, a0, 31 +; RV32-NEXT: beq a1, a3, .LBB22_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: bltz a1, .LBB22_4 +; RV32-NEXT: # %bb.2: # %overflow.lhs +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: mv a4, a1 +; RV32-NEXT: bgez a1, .LBB22_5 +; RV32-NEXT: j .LBB22_6 +; RV32-NEXT: .LBB22_3: # %overflow.no.lhs +; RV32-NEXT: li a4, 0 +; RV32-NEXT: li a5, 13 +; RV32-NEXT: mulhu a3, a0, a5 +; RV32-NEXT: mul a1, a1, a5 +; RV32-NEXT: add a3, a3, a1 +; RV32-NEXT: mul a1, a0, a5 +; RV32-NEXT: j .LBB22_7 +; RV32-NEXT: .LBB22_4: +; RV32-NEXT: neg a3, a0 +; RV32-NEXT: snez a4, a0 +; RV32-NEXT: neg a5, a1 +; RV32-NEXT: sub a4, a5, a4 +; RV32-NEXT: bltz a1, .LBB22_6 +; RV32-NEXT: .LBB22_5: # %overflow.lhs +; RV32-NEXT: mv a4, a1 +; RV32-NEXT: mv a3, a0 +; RV32-NEXT: .LBB22_6: # %overflow.lhs +; RV32-NEXT: li a0, 13 +; RV32-NEXT: mul a5, a3, a0 +; RV32-NEXT: mulhu a3, a3, a0 +; RV32-NEXT: mulhu a6, a4, a0 +; RV32-NEXT: mul a0, a4, a0 +; RV32-NEXT: srai a4, a1, 31 +; RV32-NEXT: srli a7, a1, 31 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: xor a1, a5, a4 +; RV32-NEXT: sltu a3, a0, a3 +; RV32-NEXT: add a1, a1, a7 +; RV32-NEXT: xor a0, a0, a4 +; RV32-NEXT: add a6, a6, a3 +; RV32-NEXT: sltu a5, a1, a7 +; RV32-NEXT: add a3, a0, a5 +; RV32-NEXT: sltu a0, a3, a5 +; RV32-NEXT: xor a4, a6, a4 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: snez a4, a0 +; RV32-NEXT: .LBB22_7: # %overflow.res +; RV32-NEXT: andi a0, a4, 1 +; RV32-NEXT: sw a1, 0(a2) +; RV32-NEXT: sw a3, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: smulo2.i64: @@ -1488,25 +1919,61 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo2.i64: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: li a3, 13 +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a3, a0, 31 +; RV32ZBA-NEXT: beq a1, a3, .LBB22_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: bltz a1, .LBB22_4 +; RV32ZBA-NEXT: # %bb.2: # %overflow.lhs +; RV32ZBA-NEXT: mv a3, a0 +; RV32ZBA-NEXT: mv a4, a1 +; RV32ZBA-NEXT: bgez a1, .LBB22_5 +; RV32ZBA-NEXT: j .LBB22_6 +; RV32ZBA-NEXT: .LBB22_3: # %overflow.no.lhs +; RV32ZBA-NEXT: li a3, 0 ; RV32ZBA-NEXT: sh1add a4, a1, a1 -; RV32ZBA-NEXT: sh1add a5, a0, a0 ; RV32ZBA-NEXT: sh2add a4, a4, a1 -; RV32ZBA-NEXT: mulh a1, a1, a3 -; RV32ZBA-NEXT: mulhu a3, a0, a3 -; RV32ZBA-NEXT: sh2add a5, a5, a0 +; RV32ZBA-NEXT: li a1, 13 +; RV32ZBA-NEXT: mulhu a1, a0, a1 +; RV32ZBA-NEXT: add a4, a1, a4 +; RV32ZBA-NEXT: sh1add a1, a0, a0 +; RV32ZBA-NEXT: sh2add a1, a1, a0 +; RV32ZBA-NEXT: j .LBB22_7 +; RV32ZBA-NEXT: .LBB22_4: +; RV32ZBA-NEXT: neg a3, a0 +; RV32ZBA-NEXT: snez a4, a0 +; RV32ZBA-NEXT: neg a5, a1 +; RV32ZBA-NEXT: sub a4, a5, a4 +; RV32ZBA-NEXT: bltz a1, .LBB22_6 +; RV32ZBA-NEXT: .LBB22_5: # %overflow.lhs +; RV32ZBA-NEXT: mv a4, a1 +; RV32ZBA-NEXT: mv a3, a0 +; RV32ZBA-NEXT: .LBB22_6: # %overflow.lhs +; RV32ZBA-NEXT: sh1add a0, a3, a3 +; RV32ZBA-NEXT: li a5, 13 +; RV32ZBA-NEXT: sh1add a6, a4, a4 +; RV32ZBA-NEXT: sh2add a0, a0, a3 +; RV32ZBA-NEXT: mulhu a3, a3, a5 +; RV32ZBA-NEXT: sh2add a6, a6, a4 +; RV32ZBA-NEXT: 
mulhu a4, a4, a5 +; RV32ZBA-NEXT: srai a5, a1, 31 +; RV32ZBA-NEXT: srli a7, a1, 31 +; RV32ZBA-NEXT: add a6, a3, a6 +; RV32ZBA-NEXT: xor a0, a0, a5 +; RV32ZBA-NEXT: sltu a3, a6, a3 +; RV32ZBA-NEXT: add a1, a0, a7 +; RV32ZBA-NEXT: xor a0, a6, a5 ; RV32ZBA-NEXT: add a3, a4, a3 -; RV32ZBA-NEXT: sltu a0, a3, a4 -; RV32ZBA-NEXT: srai a4, a3, 31 -; RV32ZBA-NEXT: add a0, a1, a0 -; RV32ZBA-NEXT: xor a1, a0, a4 -; RV32ZBA-NEXT: srai a0, a0, 31 -; RV32ZBA-NEXT: xor a0, a0, a4 -; RV32ZBA-NEXT: or a0, a1, a0 -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: sw a5, 0(a2) -; RV32ZBA-NEXT: sw a3, 4(a2) +; RV32ZBA-NEXT: sltu a6, a1, a7 +; RV32ZBA-NEXT: add a4, a0, a6 +; RV32ZBA-NEXT: sltu a0, a4, a6 +; RV32ZBA-NEXT: xor a3, a3, a5 +; RV32ZBA-NEXT: add a0, a3, a0 +; RV32ZBA-NEXT: snez a3, a0 +; RV32ZBA-NEXT: .LBB22_7: # %overflow.res +; RV32ZBA-NEXT: andi a0, a3, 1 +; RV32ZBA-NEXT: sw a1, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo2.i64: @@ -1522,23 +1989,56 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo2.i64: -; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: li a3, 13 -; RV32ZICOND-NEXT: mulhu a4, a0, a3 -; RV32ZICOND-NEXT: mul a5, a1, a3 -; RV32ZICOND-NEXT: mulh a1, a1, a3 -; RV32ZICOND-NEXT: mul a3, a0, a3 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: sltu a0, a4, a5 -; RV32ZICOND-NEXT: srai a5, a4, 31 -; RV32ZICOND-NEXT: add a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a0, a5 -; RV32ZICOND-NEXT: srai a0, a0, 31 +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a3, a0, 31 +; RV32ZICOND-NEXT: beq a1, a3, .LBB22_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: slti a3, a1, 0 +; RV32ZICOND-NEXT: neg a4, a0 +; RV32ZICOND-NEXT: snez a5, a0 +; RV32ZICOND-NEXT: neg a6, a1 +; RV32ZICOND-NEXT: czero.eqz a4, a4, a3 +; RV32ZICOND-NEXT: czero.nez a0, a0, a3 +; RV32ZICOND-NEXT: sub a5, a6, a5 +; RV32ZICOND-NEXT: czero.nez a6, a1, a3 +; RV32ZICOND-NEXT: or a4, a4, a0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a3 +; RV32ZICOND-NEXT: or a5, a5, a6 +; RV32ZICOND-NEXT: czero.eqz a4, a4, a3 +; RV32ZICOND-NEXT: czero.eqz a3, a5, a3 +; RV32ZICOND-NEXT: li a5, 13 +; RV32ZICOND-NEXT: or a0, a4, a0 +; RV32ZICOND-NEXT: or a3, a3, a6 +; RV32ZICOND-NEXT: mul a4, a0, a5 +; RV32ZICOND-NEXT: mulhu a0, a0, a5 +; RV32ZICOND-NEXT: mulhu a6, a3, a5 +; RV32ZICOND-NEXT: mul a3, a3, a5 +; RV32ZICOND-NEXT: srai a5, a1, 31 +; RV32ZICOND-NEXT: srli a7, a1, 31 +; RV32ZICOND-NEXT: xor a1, a4, a5 +; RV32ZICOND-NEXT: add a3, a0, a3 +; RV32ZICOND-NEXT: add a1, a1, a7 +; RV32ZICOND-NEXT: sltu a0, a3, a0 +; RV32ZICOND-NEXT: sltu a4, a1, a7 +; RV32ZICOND-NEXT: xor a3, a3, a5 +; RV32ZICOND-NEXT: add a0, a6, a0 +; RV32ZICOND-NEXT: add a3, a3, a4 +; RV32ZICOND-NEXT: sltu a4, a3, a4 ; RV32ZICOND-NEXT: xor a0, a0, a5 -; RV32ZICOND-NEXT: or a0, a1, a0 -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: sw a3, 0(a2) -; RV32ZICOND-NEXT: sw a4, 4(a2) +; RV32ZICOND-NEXT: add a0, a0, a4 +; RV32ZICOND-NEXT: snez a4, a0 +; RV32ZICOND-NEXT: j .LBB22_3 +; RV32ZICOND-NEXT: .LBB22_2: # %overflow.no.lhs +; RV32ZICOND-NEXT: li a4, 0 +; RV32ZICOND-NEXT: li a5, 13 +; RV32ZICOND-NEXT: mulhu a3, a0, a5 +; RV32ZICOND-NEXT: mul a1, a1, a5 +; RV32ZICOND-NEXT: add a3, a3, a1 +; RV32ZICOND-NEXT: mul a1, a0, a5 +; RV32ZICOND-NEXT: .LBB22_3: # %overflow.res +; RV32ZICOND-NEXT: andi a0, a4, 1 +; RV32ZICOND-NEXT: sw a1, 0(a2) +; RV32ZICOND-NEXT: sw a3, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo2.i64: @@ -1766,26 +2266,71 @@ define signext i32 
@umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) { define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: umulo.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB26_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beqz a3, .LBB26_5 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mul a5, a3, a0 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: mulhu a7, a0, a2 ; RV32-NEXT: snez t0, a3 -; RV32-NEXT: mulhu a3, a3, a0 -; RV32-NEXT: mul t1, a0, a2 -; RV32-NEXT: mulhu a0, a1, a2 -; RV32-NEXT: snez a1, a1 ; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: and a1, a1, t0 -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: snez a2, a3 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: sltu a1, a5, a7 -; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: sw t1, 0(a4) -; RV32-NEXT: sw a5, 4(a4) +; RV32-NEXT: snez a6, a1 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: mulhu a3, a3, a0 +; RV32-NEXT: and a6, a6, t0 +; RV32-NEXT: snez t0, a1 +; RV32-NEXT: snez a3, a3 +; RV32-NEXT: add a1, a7, a5 +; RV32-NEXT: or a5, a6, t0 +; RV32-NEXT: sltu a6, a1, a7 +; RV32-NEXT: or a3, a5, a3 +; RV32-NEXT: or a6, a3, a6 +; RV32-NEXT: j .LBB26_7 +; RV32-NEXT: .LBB26_3: # %overflow.no.lhs +; RV32-NEXT: beqz a3, .LBB26_6 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a6, a0, a2 +; RV32-NEXT: mul a7, a1, a2 +; RV32-NEXT: mul a5, a0, a2 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: mulhu a2, a0, a3 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a2, a2, a1 +; RV32-NEXT: mul a1, a0, a3 +; RV32-NEXT: add a1, a6, a1 +; RV32-NEXT: sltu a0, a1, a6 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: snez a6, a0 +; RV32-NEXT: j .LBB26_8 +; RV32-NEXT: .LBB26_5: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a6, a2, a0 +; RV32-NEXT: mul a7, a3, a0 +; RV32-NEXT: mul a5, a2, a0 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: mulhu a0, a2, a1 +; RV32-NEXT: mul a3, a3, a1 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: mul a1, a2, a1 +; RV32-NEXT: add a1, a6, a1 +; RV32-NEXT: sltu a2, a1, a6 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: snez a6, a0 +; RV32-NEXT: j .LBB26_8 +; RV32-NEXT: .LBB26_6: # %overflow.no +; RV32-NEXT: li a6, 0 +; RV32-NEXT: mulhu a5, a0, a2 +; RV32-NEXT: mul a3, a0, a3 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: .LBB26_7: # %overflow.res +; RV32-NEXT: mul a5, a0, a2 +; RV32-NEXT: .LBB26_8: # %overflow.res +; RV32-NEXT: andi a0, a6, 1 +; RV32-NEXT: sw a5, 0(a4) +; RV32-NEXT: sw a1, 4(a4) ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.i64: @@ -1798,26 +2343,71 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB26_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beqz a3, .LBB26_5 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mul a5, a3, a0 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: mulhu a7, a0, a2 ; RV32ZBA-NEXT: snez t0, a3 -; RV32ZBA-NEXT: mulhu a3, a3, a0 -; RV32ZBA-NEXT: mul t1, a0, a2 -; RV32ZBA-NEXT: mulhu a0, a1, a2 -; RV32ZBA-NEXT: snez a1, a1 ; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: and a1, a1, t0 -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: snez a2, a3 -; RV32ZBA-NEXT: add a5, a7, a5 -; RV32ZBA-NEXT: or a0, a1, a0 -; RV32ZBA-NEXT: sltu a1, a5, a7 -; RV32ZBA-NEXT: or a0, a0, a2 -; RV32ZBA-NEXT: or a0, a0, a1 -; RV32ZBA-NEXT: sw t1, 0(a4) -; RV32ZBA-NEXT: sw a5, 4(a4) +; RV32ZBA-NEXT: snez a6, a1 +; 
RV32ZBA-NEXT: mulhu a1, a1, a2 +; RV32ZBA-NEXT: mulhu a3, a3, a0 +; RV32ZBA-NEXT: and a6, a6, t0 +; RV32ZBA-NEXT: snez t0, a1 +; RV32ZBA-NEXT: snez a3, a3 +; RV32ZBA-NEXT: add a1, a7, a5 +; RV32ZBA-NEXT: or a5, a6, t0 +; RV32ZBA-NEXT: sltu a6, a1, a7 +; RV32ZBA-NEXT: or a3, a5, a3 +; RV32ZBA-NEXT: or a6, a3, a6 +; RV32ZBA-NEXT: j .LBB26_7 +; RV32ZBA-NEXT: .LBB26_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beqz a3, .LBB26_6 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a6, a0, a2 +; RV32ZBA-NEXT: mul a7, a1, a2 +; RV32ZBA-NEXT: mul a5, a0, a2 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: mulhu a2, a0, a3 +; RV32ZBA-NEXT: mul a1, a1, a3 +; RV32ZBA-NEXT: add a2, a2, a1 +; RV32ZBA-NEXT: mul a1, a0, a3 +; RV32ZBA-NEXT: add a1, a6, a1 +; RV32ZBA-NEXT: sltu a0, a1, a6 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: snez a6, a0 +; RV32ZBA-NEXT: j .LBB26_8 +; RV32ZBA-NEXT: .LBB26_5: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a6, a2, a0 +; RV32ZBA-NEXT: mul a7, a3, a0 +; RV32ZBA-NEXT: mul a5, a2, a0 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: mulhu a0, a2, a1 +; RV32ZBA-NEXT: mul a3, a3, a1 +; RV32ZBA-NEXT: add a0, a0, a3 +; RV32ZBA-NEXT: mul a1, a2, a1 +; RV32ZBA-NEXT: add a1, a6, a1 +; RV32ZBA-NEXT: sltu a2, a1, a6 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: snez a6, a0 +; RV32ZBA-NEXT: j .LBB26_8 +; RV32ZBA-NEXT: .LBB26_6: # %overflow.no +; RV32ZBA-NEXT: li a6, 0 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: mul a3, a0, a3 +; RV32ZBA-NEXT: add a3, a5, a3 +; RV32ZBA-NEXT: mul a1, a1, a2 +; RV32ZBA-NEXT: add a1, a3, a1 +; RV32ZBA-NEXT: .LBB26_7: # %overflow.res +; RV32ZBA-NEXT: mul a5, a0, a2 +; RV32ZBA-NEXT: .LBB26_8: # %overflow.res +; RV32ZBA-NEXT: andi a0, a6, 1 +; RV32ZBA-NEXT: sw a5, 0(a4) +; RV32ZBA-NEXT: sw a1, 4(a4) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.i64: @@ -1830,26 +2420,71 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB26_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB26_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mul a5, a3, a0 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhu a7, a0, a2 ; RV32ZICOND-NEXT: snez t0, a3 -; RV32ZICOND-NEXT: mulhu a3, a3, a0 -; RV32ZICOND-NEXT: mul t1, a0, a2 -; RV32ZICOND-NEXT: mulhu a0, a1, a2 -; RV32ZICOND-NEXT: snez a1, a1 ; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: and a1, a1, t0 -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: snez a2, a3 -; RV32ZICOND-NEXT: add a5, a7, a5 -; RV32ZICOND-NEXT: or a0, a1, a0 -; RV32ZICOND-NEXT: sltu a1, a5, a7 -; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: sw t1, 0(a4) -; RV32ZICOND-NEXT: sw a5, 4(a4) +; RV32ZICOND-NEXT: snez a6, a1 +; RV32ZICOND-NEXT: mulhu a1, a1, a2 +; RV32ZICOND-NEXT: mulhu a3, a3, a0 +; RV32ZICOND-NEXT: and a6, a6, t0 +; RV32ZICOND-NEXT: snez t0, a1 +; RV32ZICOND-NEXT: snez a3, a3 +; RV32ZICOND-NEXT: add a1, a7, a5 +; RV32ZICOND-NEXT: or a5, a6, t0 +; RV32ZICOND-NEXT: sltu a6, a1, a7 +; RV32ZICOND-NEXT: or a3, a5, a3 +; RV32ZICOND-NEXT: or a6, a3, a6 +; RV32ZICOND-NEXT: j .LBB26_7 +; RV32ZICOND-NEXT: .LBB26_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB26_6 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: mulhu a6, a0, a2 +; RV32ZICOND-NEXT: mul a7, a1, a2 +; RV32ZICOND-NEXT: mul a5, a0, a2 +; RV32ZICOND-NEXT: add a6, a6, a7 
+; RV32ZICOND-NEXT: mulhu a2, a0, a3 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: add a2, a2, a1 +; RV32ZICOND-NEXT: mul a1, a0, a3 +; RV32ZICOND-NEXT: add a1, a6, a1 +; RV32ZICOND-NEXT: sltu a0, a1, a6 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: snez a6, a0 +; RV32ZICOND-NEXT: j .LBB26_8 +; RV32ZICOND-NEXT: .LBB26_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: mulhu a6, a2, a0 +; RV32ZICOND-NEXT: mul a7, a3, a0 +; RV32ZICOND-NEXT: mul a5, a2, a0 +; RV32ZICOND-NEXT: add a6, a6, a7 +; RV32ZICOND-NEXT: mulhu a0, a2, a1 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: add a1, a6, a1 +; RV32ZICOND-NEXT: sltu a2, a1, a6 +; RV32ZICOND-NEXT: add a0, a0, a2 +; RV32ZICOND-NEXT: snez a6, a0 +; RV32ZICOND-NEXT: j .LBB26_8 +; RV32ZICOND-NEXT: .LBB26_6: # %overflow.no +; RV32ZICOND-NEXT: li a6, 0 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 +; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: add a3, a5, a3 +; RV32ZICOND-NEXT: mul a1, a1, a2 +; RV32ZICOND-NEXT: add a1, a3, a1 +; RV32ZICOND-NEXT: .LBB26_7: # %overflow.res +; RV32ZICOND-NEXT: mul a5, a0, a2 +; RV32ZICOND-NEXT: .LBB26_8: # %overflow.res +; RV32ZICOND-NEXT: andi a0, a6, 1 +; RV32ZICOND-NEXT: sw a5, 0(a4) +; RV32ZICOND-NEXT: sw a1, 4(a4) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.i64: @@ -1870,18 +2505,30 @@ entry: define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: umulo2.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB27_2 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: li a4, 13 +; RV32-NEXT: mul a3, a0, a4 +; RV32-NEXT: mulhu a0, a0, a4 +; RV32-NEXT: mulhu a5, a1, a4 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: snez a4, a0 +; RV32-NEXT: j .LBB27_3 +; RV32-NEXT: .LBB27_2: # %overflow.no.lhs +; RV32-NEXT: li a4, 0 ; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a4, a1, a3 ; RV32-NEXT: mulhu a5, a0, a3 -; RV32-NEXT: mulhu a1, a1, a3 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, a5, a1 ; RV32-NEXT: mul a3, a0, a3 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: snez a0, a1 -; RV32-NEXT: sltu a1, a4, a5 -; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: .LBB27_3: # %overflow.res +; RV32-NEXT: andi a0, a4, 1 ; RV32-NEXT: sw a3, 0(a2) -; RV32-NEXT: sw a4, 4(a2) +; RV32-NEXT: sw a1, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: umulo2.i64: @@ -1895,20 +2542,34 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo2.i64: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: li a3, 13 +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB27_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: sh1add a3, a0, a0 +; RV32ZBA-NEXT: li a5, 13 +; RV32ZBA-NEXT: sh1add a6, a1, a1 +; RV32ZBA-NEXT: sh2add a4, a3, a0 +; RV32ZBA-NEXT: mulhu a0, a0, a5 +; RV32ZBA-NEXT: mulhu a3, a1, a5 +; RV32ZBA-NEXT: sh2add a1, a6, a1 +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: add a0, a3, a0 +; RV32ZBA-NEXT: snez a3, a0 +; RV32ZBA-NEXT: j .LBB27_3 +; RV32ZBA-NEXT: .LBB27_2: # %overflow.no.lhs +; RV32ZBA-NEXT: li a3, 0 ; RV32ZBA-NEXT: sh1add a4, a1, a1 -; RV32ZBA-NEXT: sh1add a5, a0, a0 -; RV32ZBA-NEXT: sh2add a4, a4, a1 -; RV32ZBA-NEXT: mulhu a1, a1, a3 -; RV32ZBA-NEXT: mulhu a3, a0, a3 -; RV32ZBA-NEXT: sh2add a5, a5, a0 -; RV32ZBA-NEXT: add a4, a3, a4 -; RV32ZBA-NEXT: snez a0, a1 -; RV32ZBA-NEXT: sltu a1, a4, a3 -; RV32ZBA-NEXT: or a0, a0, a1 -; 
RV32ZBA-NEXT: sw a5, 0(a2) -; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: sh2add a1, a4, a1 +; RV32ZBA-NEXT: li a4, 13 +; RV32ZBA-NEXT: mulhu a4, a0, a4 +; RV32ZBA-NEXT: add a1, a4, a1 +; RV32ZBA-NEXT: sh1add a4, a0, a0 +; RV32ZBA-NEXT: sh2add a4, a4, a0 +; RV32ZBA-NEXT: .LBB27_3: # %overflow.res +; RV32ZBA-NEXT: andi a0, a3, 1 +; RV32ZBA-NEXT: sw a4, 0(a2) +; RV32ZBA-NEXT: sw a1, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo2.i64: @@ -1923,18 +2584,30 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo2.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB27_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: li a4, 13 +; RV32ZICOND-NEXT: mul a3, a0, a4 +; RV32ZICOND-NEXT: mulhu a0, a0, a4 +; RV32ZICOND-NEXT: mulhu a5, a1, a4 +; RV32ZICOND-NEXT: mul a1, a1, a4 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a5, a0 +; RV32ZICOND-NEXT: snez a4, a0 +; RV32ZICOND-NEXT: j .LBB27_3 +; RV32ZICOND-NEXT: .LBB27_2: # %overflow.no.lhs +; RV32ZICOND-NEXT: li a4, 0 ; RV32ZICOND-NEXT: li a3, 13 -; RV32ZICOND-NEXT: mul a4, a1, a3 ; RV32ZICOND-NEXT: mulhu a5, a0, a3 -; RV32ZICOND-NEXT: mulhu a1, a1, a3 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: add a1, a5, a1 ; RV32ZICOND-NEXT: mul a3, a0, a3 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: snez a0, a1 -; RV32ZICOND-NEXT: sltu a1, a4, a5 -; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: .LBB27_3: # %overflow.res +; RV32ZICOND-NEXT: andi a0, a4, 1 ; RV32ZICOND-NEXT: sw a3, 0(a2) -; RV32ZICOND-NEXT: sw a4, 4(a2) +; RV32ZICOND-NEXT: sw a1, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo2.i64: @@ -3218,7 +3891,13 @@ entry: define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.select.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a5, a0, 31 +; RV32-NEXT: srai a4, a2, 31 +; RV32-NEXT: beq a1, a5, .LBB46_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beq a3, a4, .LBB46_6 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a6, a1, a2 @@ -3246,11 +3925,119 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a5, a5, a4 ; RV32-NEXT: xor a4, a6, a4 ; RV32-NEXT: or a4, a4, a5 -; RV32-NEXT: bnez a4, .LBB46_2 -; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: j .LBB46_26 +; RV32-NEXT: .LBB46_3: # %overflow.no.lhs +; RV32-NEXT: beq a3, a4, .LBB46_8 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: bltz a1, .LBB46_9 +; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: bgez a1, .LBB46_10 +; RV32-NEXT: j .LBB46_11 +; RV32-NEXT: .LBB46_6: # %overflow.no.rhs.only +; RV32-NEXT: bltz a3, .LBB46_13 +; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: bgez a3, .LBB46_14 +; RV32-NEXT: j .LBB46_15 +; RV32-NEXT: .LBB46_8: # %overflow.no +; RV32-NEXT: j .LBB46_27 +; RV32-NEXT: .LBB46_9: +; RV32-NEXT: neg a4, a0 +; RV32-NEXT: snez a5, a0 +; RV32-NEXT: neg a6, a1 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a1, .LBB46_11 +; RV32-NEXT: .LBB46_10: # %overflow.no.lhs.only +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: .LBB46_11: # %overflow.no.lhs.only +; RV32-NEXT: bltz a3, .LBB46_17 +; RV32-NEXT: # %bb.12: # %overflow.no.lhs.only +; RV32-NEXT: mv a7, a2 +; RV32-NEXT: mv a6, a3 +; RV32-NEXT: j .LBB46_18 +; 
RV32-NEXT: .LBB46_13: +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: snez a5, a2 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a3, .LBB46_15 +; RV32-NEXT: .LBB46_14: # %overflow.no.rhs.only +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: .LBB46_15: # %overflow.no.rhs.only +; RV32-NEXT: bltz a1, .LBB46_21 +; RV32-NEXT: # %bb.16: # %overflow.no.rhs.only +; RV32-NEXT: mv a7, a0 +; RV32-NEXT: mv a6, a1 +; RV32-NEXT: j .LBB46_22 +; RV32-NEXT: .LBB46_17: +; RV32-NEXT: neg a7, a2 +; RV32-NEXT: snez a6, a2 +; RV32-NEXT: neg t0, a3 +; RV32-NEXT: sub a6, t0, a6 +; RV32-NEXT: .LBB46_18: # %overflow.no.lhs.only +; RV32-NEXT: slti t0, a1, 0 +; RV32-NEXT: slti t1, a3, 0 +; RV32-NEXT: bltz a3, .LBB46_20 +; RV32-NEXT: # %bb.19: # %overflow.no.lhs.only +; RV32-NEXT: mv a6, a3 +; RV32-NEXT: mv a7, a2 +; RV32-NEXT: .LBB46_20: # %overflow.no.lhs.only +; RV32-NEXT: mulhu t2, a4, a7 +; RV32-NEXT: mul t3, a5, a7 +; RV32-NEXT: mul a7, a4, a7 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: mulhu t4, a4, a6 +; RV32-NEXT: mul a4, a4, a6 +; RV32-NEXT: xor a6, t1, t0 +; RV32-NEXT: j .LBB46_25 +; RV32-NEXT: .LBB46_21: +; RV32-NEXT: neg a7, a0 +; RV32-NEXT: snez a6, a0 +; RV32-NEXT: neg t0, a1 +; RV32-NEXT: sub a6, t0, a6 +; RV32-NEXT: .LBB46_22: # %overflow.no.rhs.only +; RV32-NEXT: slti t0, a3, 0 +; RV32-NEXT: slti t1, a1, 0 +; RV32-NEXT: bltz a1, .LBB46_24 +; RV32-NEXT: # %bb.23: # %overflow.no.rhs.only +; RV32-NEXT: mv a6, a1 +; RV32-NEXT: mv a7, a0 +; RV32-NEXT: .LBB46_24: # %overflow.no.rhs.only +; RV32-NEXT: mulhu t2, a4, a7 +; RV32-NEXT: mul t3, a5, a7 +; RV32-NEXT: mul a7, a4, a7 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: mulhu t4, a4, a6 +; RV32-NEXT: mul a4, a4, a6 +; RV32-NEXT: xor a6, t0, t1 +; RV32-NEXT: .LBB46_25: # %overflow.res +; RV32-NEXT: add t2, t2, t3 +; RV32-NEXT: add a5, t4, a5 +; RV32-NEXT: neg t0, a6 +; RV32-NEXT: add a4, t2, a4 +; RV32-NEXT: xor a7, a7, t0 +; RV32-NEXT: sltu t1, a4, t2 +; RV32-NEXT: add a7, a7, a6 +; RV32-NEXT: xor a4, a4, t0 +; RV32-NEXT: add a5, a5, t1 +; RV32-NEXT: sltu a6, a7, a6 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: sltu a4, a4, a6 +; RV32-NEXT: xor a5, a5, t0 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: .LBB46_26: # %overflow.res +; RV32-NEXT: snez a4, a4 +; RV32-NEXT: andi a4, a4, 1 +; RV32-NEXT: bnez a4, .LBB46_28 +; RV32-NEXT: .LBB46_27: # %overflow.res ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB46_2: # %entry +; RV32-NEXT: .LBB46_28: # %overflow.res ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.select.i64: @@ -3265,7 +4052,13 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.select.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a5, a0, 31 +; RV32ZBA-NEXT: srai a4, a2, 31 +; RV32ZBA-NEXT: beq a1, a5, .LBB46_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB46_6 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a6, a1, a2 @@ -3293,11 +4086,119 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a5, a5, a4 ; RV32ZBA-NEXT: xor a4, a6, a4 ; RV32ZBA-NEXT: or a4, a4, a5 -; RV32ZBA-NEXT: bnez a4, .LBB46_2 -; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: j .LBB46_26 +; RV32ZBA-NEXT: .LBB46_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB46_8 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a1, .LBB46_9 +; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a4, a0 +; 
RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: bgez a1, .LBB46_10 +; RV32ZBA-NEXT: j .LBB46_11 +; RV32ZBA-NEXT: .LBB46_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a3, .LBB46_13 +; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: bgez a3, .LBB46_14 +; RV32ZBA-NEXT: j .LBB46_15 +; RV32ZBA-NEXT: .LBB46_8: # %overflow.no +; RV32ZBA-NEXT: j .LBB46_27 +; RV32ZBA-NEXT: .LBB46_9: +; RV32ZBA-NEXT: neg a4, a0 +; RV32ZBA-NEXT: snez a5, a0 +; RV32ZBA-NEXT: neg a6, a1 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a1, .LBB46_11 +; RV32ZBA-NEXT: .LBB46_10: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: .LBB46_11: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a3, .LBB46_17 +; RV32ZBA-NEXT: # %bb.12: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a7, a2 +; RV32ZBA-NEXT: mv a6, a3 +; RV32ZBA-NEXT: j .LBB46_18 +; RV32ZBA-NEXT: .LBB46_13: +; RV32ZBA-NEXT: neg a4, a2 +; RV32ZBA-NEXT: snez a5, a2 +; RV32ZBA-NEXT: neg a6, a3 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a3, .LBB46_15 +; RV32ZBA-NEXT: .LBB46_14: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: .LBB46_15: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a1, .LBB46_21 +; RV32ZBA-NEXT: # %bb.16: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a7, a0 +; RV32ZBA-NEXT: mv a6, a1 +; RV32ZBA-NEXT: j .LBB46_22 +; RV32ZBA-NEXT: .LBB46_17: +; RV32ZBA-NEXT: neg a7, a2 +; RV32ZBA-NEXT: snez a6, a2 +; RV32ZBA-NEXT: neg t0, a3 +; RV32ZBA-NEXT: sub a6, t0, a6 +; RV32ZBA-NEXT: .LBB46_18: # %overflow.no.lhs.only +; RV32ZBA-NEXT: slti t0, a1, 0 +; RV32ZBA-NEXT: slti t1, a3, 0 +; RV32ZBA-NEXT: bltz a3, .LBB46_20 +; RV32ZBA-NEXT: # %bb.19: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a6, a3 +; RV32ZBA-NEXT: mv a7, a2 +; RV32ZBA-NEXT: .LBB46_20: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu t2, a4, a7 +; RV32ZBA-NEXT: mul t3, a5, a7 +; RV32ZBA-NEXT: mul a7, a4, a7 +; RV32ZBA-NEXT: mul a5, a5, a6 +; RV32ZBA-NEXT: mulhu t4, a4, a6 +; RV32ZBA-NEXT: mul a4, a4, a6 +; RV32ZBA-NEXT: xor a6, t1, t0 +; RV32ZBA-NEXT: j .LBB46_25 +; RV32ZBA-NEXT: .LBB46_21: +; RV32ZBA-NEXT: neg a7, a0 +; RV32ZBA-NEXT: snez a6, a0 +; RV32ZBA-NEXT: neg t0, a1 +; RV32ZBA-NEXT: sub a6, t0, a6 +; RV32ZBA-NEXT: .LBB46_22: # %overflow.no.rhs.only +; RV32ZBA-NEXT: slti t0, a3, 0 +; RV32ZBA-NEXT: slti t1, a1, 0 +; RV32ZBA-NEXT: bltz a1, .LBB46_24 +; RV32ZBA-NEXT: # %bb.23: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a6, a1 +; RV32ZBA-NEXT: mv a7, a0 +; RV32ZBA-NEXT: .LBB46_24: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu t2, a4, a7 +; RV32ZBA-NEXT: mul t3, a5, a7 +; RV32ZBA-NEXT: mul a7, a4, a7 +; RV32ZBA-NEXT: mul a5, a5, a6 +; RV32ZBA-NEXT: mulhu t4, a4, a6 +; RV32ZBA-NEXT: mul a4, a4, a6 +; RV32ZBA-NEXT: xor a6, t0, t1 +; RV32ZBA-NEXT: .LBB46_25: # %overflow.res +; RV32ZBA-NEXT: add t2, t2, t3 +; RV32ZBA-NEXT: add a5, t4, a5 +; RV32ZBA-NEXT: neg t0, a6 +; RV32ZBA-NEXT: add a4, t2, a4 +; RV32ZBA-NEXT: xor a7, a7, t0 +; RV32ZBA-NEXT: sltu t1, a4, t2 +; RV32ZBA-NEXT: add a7, a7, a6 +; RV32ZBA-NEXT: xor a4, a4, t0 +; RV32ZBA-NEXT: add a5, a5, t1 +; RV32ZBA-NEXT: sltu a6, a7, a6 +; RV32ZBA-NEXT: add a4, a4, a6 +; RV32ZBA-NEXT: sltu a4, a4, a6 +; RV32ZBA-NEXT: xor a5, a5, t0 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: .LBB46_26: # %overflow.res +; RV32ZBA-NEXT: snez a4, a4 +; RV32ZBA-NEXT: andi a4, a4, 1 +; RV32ZBA-NEXT: bnez a4, .LBB46_28 +; RV32ZBA-NEXT: .LBB46_27: # %overflow.res ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 
-; RV32ZBA-NEXT: .LBB46_2: # %entry +; RV32ZBA-NEXT: .LBB46_28: # %overflow.res ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.select.i64: @@ -3312,7 +4213,13 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.select.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a5, a0, 31 +; RV32ZICOND-NEXT: srai a4, a2, 31 +; RV32ZICOND-NEXT: beq a1, a5, .LBB46_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB46_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a6, a1, a2 @@ -3335,11 +4242,99 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: srai a4, a4, 31 ; RV32ZICOND-NEXT: add a6, a7, a6 ; RV32ZICOND-NEXT: sltu a7, a6, a7 -; RV32ZICOND-NEXT: xor a6, a6, a4 ; RV32ZICOND-NEXT: add a5, t0, a5 ; RV32ZICOND-NEXT: add a5, a5, a7 -; RV32ZICOND-NEXT: xor a4, a5, a4 -; RV32ZICOND-NEXT: or a4, a6, a4 +; RV32ZICOND-NEXT: xor a5, a5, a4 +; RV32ZICOND-NEXT: xor a4, a6, a4 +; RV32ZICOND-NEXT: or a4, a4, a5 +; RV32ZICOND-NEXT: j .LBB46_7 +; RV32ZICOND-NEXT: .LBB46_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB46_8 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: slti a4, a1, 0 +; RV32ZICOND-NEXT: neg a5, a0 +; RV32ZICOND-NEXT: snez a6, a0 +; RV32ZICOND-NEXT: neg a7, a1 +; RV32ZICOND-NEXT: slti t0, a3, 0 +; RV32ZICOND-NEXT: neg t1, a2 +; RV32ZICOND-NEXT: snez t2, a2 +; RV32ZICOND-NEXT: neg t3, a3 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez t4, a0, a4 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; RV32ZICOND-NEXT: czero.nez a7, a1, a4 +; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 +; RV32ZICOND-NEXT: sub t2, t3, t2 +; RV32ZICOND-NEXT: czero.nez t3, a2, t0 +; RV32ZICOND-NEXT: or a5, a5, t4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a5, a5, t4 +; RV32ZICOND-NEXT: czero.nez t4, a3, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a7 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, t0, a4 +; RV32ZICOND-NEXT: j .LBB46_6 +; RV32ZICOND-NEXT: .LBB46_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: slti a4, a3, 0 +; RV32ZICOND-NEXT: neg a5, a2 +; RV32ZICOND-NEXT: snez a6, a2 +; RV32ZICOND-NEXT: neg a7, a3 +; RV32ZICOND-NEXT: slti t0, a1, 0 +; RV32ZICOND-NEXT: neg t1, a0 +; RV32ZICOND-NEXT: snez t2, a0 +; RV32ZICOND-NEXT: neg t3, a1 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez t4, a2, a4 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; RV32ZICOND-NEXT: czero.nez a7, a3, a4 +; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 +; RV32ZICOND-NEXT: sub t2, t3, t2 +; RV32ZICOND-NEXT: czero.nez t3, a0, t0 +; RV32ZICOND-NEXT: or a5, a5, t4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a5, a5, t4 +; RV32ZICOND-NEXT: czero.nez t4, a1, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a7 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, a4, t0 +; RV32ZICOND-NEXT: .LBB46_6: # %overflow.res +; RV32ZICOND-NEXT: or t1, t1, t3 +; RV32ZICOND-NEXT: czero.eqz t2, t2, t0 +; RV32ZICOND-NEXT: or t2, t2, t4 +; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 +; RV32ZICOND-NEXT: czero.eqz t0, t2, t0 +; RV32ZICOND-NEXT: or t1, t1, t3 +; RV32ZICOND-NEXT: or a6, a6, a7 +; RV32ZICOND-NEXT: or a7, t0, t4 +; RV32ZICOND-NEXT: mulhu t0, a5, t1 +; RV32ZICOND-NEXT: mul t2, a5, t1 +; RV32ZICOND-NEXT: mul t1, a6, t1 +; RV32ZICOND-NEXT: mul a6, a6, a7 +; 
RV32ZICOND-NEXT: mulhu t3, a5, a7 +; RV32ZICOND-NEXT: mul a5, a5, a7 +; RV32ZICOND-NEXT: neg a7, a4 +; RV32ZICOND-NEXT: xor t2, t2, a7 +; RV32ZICOND-NEXT: add t0, t0, t1 +; RV32ZICOND-NEXT: add a6, t3, a6 +; RV32ZICOND-NEXT: add t2, t2, a4 +; RV32ZICOND-NEXT: add a5, t0, a5 +; RV32ZICOND-NEXT: sltu a4, t2, a4 +; RV32ZICOND-NEXT: sltu t0, a5, t0 +; RV32ZICOND-NEXT: xor a5, a5, a7 +; RV32ZICOND-NEXT: add a6, a6, t0 +; RV32ZICOND-NEXT: add a5, a5, a4 +; RV32ZICOND-NEXT: sltu a4, a5, a4 +; RV32ZICOND-NEXT: xor a5, a6, a7 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: .LBB46_7: # %overflow.res +; RV32ZICOND-NEXT: snez a4, a4 +; RV32ZICOND-NEXT: j .LBB46_9 +; RV32ZICOND-NEXT: .LBB46_8: # %overflow.no +; RV32ZICOND-NEXT: li a4, 0 +; RV32ZICOND-NEXT: .LBB46_9: # %overflow.res +; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: czero.nez a3, a3, a4 @@ -3367,7 +4362,13 @@ entry: define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.not.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a5, a0, 31 +; RV32-NEXT: srai a4, a2, 31 +; RV32-NEXT: beq a1, a5, .LBB47_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beq a3, a4, .LBB47_6 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a2, a1, a2 @@ -3395,7 +4396,128 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a4, a5, a4 ; RV32-NEXT: or a0, a4, a0 -; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: j .LBB47_25 +; RV32-NEXT: .LBB47_3: # %overflow.no.lhs +; RV32-NEXT: beq a3, a4, .LBB47_8 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: bltz a1, .LBB47_9 +; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: bgez a1, .LBB47_10 +; RV32-NEXT: j .LBB47_11 +; RV32-NEXT: .LBB47_6: # %overflow.no.rhs.only +; RV32-NEXT: bltz a3, .LBB47_13 +; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: bgez a3, .LBB47_14 +; RV32-NEXT: j .LBB47_15 +; RV32-NEXT: .LBB47_8: # %overflow.no +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB47_9: +; RV32-NEXT: neg a4, a0 +; RV32-NEXT: snez a5, a0 +; RV32-NEXT: neg a6, a1 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a1, .LBB47_11 +; RV32-NEXT: .LBB47_10: # %overflow.no.lhs.only +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: .LBB47_11: # %overflow.no.lhs.only +; RV32-NEXT: bltz a3, .LBB47_17 +; RV32-NEXT: # %bb.12: # %overflow.no.lhs.only +; RV32-NEXT: mv a6, a2 +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: j .LBB47_18 +; RV32-NEXT: .LBB47_13: +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: snez a5, a2 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a3, .LBB47_15 +; RV32-NEXT: .LBB47_14: # %overflow.no.rhs.only +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: .LBB47_15: # %overflow.no.rhs.only +; RV32-NEXT: bltz a1, .LBB47_21 +; RV32-NEXT: # %bb.16: # %overflow.no.rhs.only +; RV32-NEXT: mv a6, a0 +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: j .LBB47_22 +; RV32-NEXT: .LBB47_17: +; RV32-NEXT: neg a6, a2 +; RV32-NEXT: snez a0, a2 +; RV32-NEXT: neg a7, a3 +; RV32-NEXT: sub a0, a7, a0 +; RV32-NEXT: .LBB47_18: # %overflow.no.lhs.only +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: slti a7, a3, 0 +; RV32-NEXT: bltz a3, .LBB47_20 +; RV32-NEXT: # %bb.19: # %overflow.no.lhs.only +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: mv a6, a2 +; RV32-NEXT: .LBB47_20: # 
%overflow.no.lhs.only +; RV32-NEXT: mulhu a2, a4, a6 +; RV32-NEXT: mul a3, a5, a6 +; RV32-NEXT: mul a6, a4, a6 +; RV32-NEXT: mul a5, a5, a0 +; RV32-NEXT: mulhu t0, a4, a0 +; RV32-NEXT: mul a0, a4, a0 +; RV32-NEXT: xor a1, a7, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a5, t0, a5 +; RV32-NEXT: neg a3, a1 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: xor a4, a6, a3 +; RV32-NEXT: sltu a2, a0, a2 +; RV32-NEXT: add a4, a4, a1 +; RV32-NEXT: xor a0, a0, a3 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: sltu a1, a4, a1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: j .LBB47_25 +; RV32-NEXT: .LBB47_21: +; RV32-NEXT: neg a6, a0 +; RV32-NEXT: snez a2, a0 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: sub a2, a7, a2 +; RV32-NEXT: .LBB47_22: # %overflow.no.rhs.only +; RV32-NEXT: slti a3, a3, 0 +; RV32-NEXT: slti a7, a1, 0 +; RV32-NEXT: bltz a1, .LBB47_24 +; RV32-NEXT: # %bb.23: # %overflow.no.rhs.only +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a6, a0 +; RV32-NEXT: .LBB47_24: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a0, a4, a6 +; RV32-NEXT: mul a1, a5, a6 +; RV32-NEXT: mul a6, a4, a6 +; RV32-NEXT: mul a5, a5, a2 +; RV32-NEXT: mulhu t0, a4, a2 +; RV32-NEXT: mul a2, a4, a2 +; RV32-NEXT: xor a3, a3, a7 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a5, t0, a5 +; RV32-NEXT: neg a1, a3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: xor a4, a6, a1 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a4, a4, a3 +; RV32-NEXT: xor a2, a2, a1 +; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: sltu a3, a4, a3 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sltu a2, a2, a3 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: .LBB47_25: # %overflow.res +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.not.i64: @@ -3408,7 +4530,13 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.not.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a5, a0, 31 +; RV32ZBA-NEXT: srai a4, a2, 31 +; RV32ZBA-NEXT: beq a1, a5, .LBB47_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB47_6 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a2, a1, a2 @@ -3436,7 +4564,128 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a4, a5, a4 ; RV32ZBA-NEXT: or a0, a4, a0 -; RV32ZBA-NEXT: seqz a0, a0 +; RV32ZBA-NEXT: j .LBB47_25 +; RV32ZBA-NEXT: .LBB47_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB47_8 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a1, .LBB47_9 +; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: bgez a1, .LBB47_10 +; RV32ZBA-NEXT: j .LBB47_11 +; RV32ZBA-NEXT: .LBB47_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a3, .LBB47_13 +; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: bgez a3, .LBB47_14 +; RV32ZBA-NEXT: j .LBB47_15 +; RV32ZBA-NEXT: .LBB47_8: # %overflow.no +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB47_9: +; RV32ZBA-NEXT: neg a4, a0 +; RV32ZBA-NEXT: snez a5, a0 +; RV32ZBA-NEXT: neg a6, a1 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a1, .LBB47_11 +; RV32ZBA-NEXT: .LBB47_10: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: .LBB47_11: 
# %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a3, .LBB47_17 +; RV32ZBA-NEXT: # %bb.12: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a6, a2 +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: j .LBB47_18 +; RV32ZBA-NEXT: .LBB47_13: +; RV32ZBA-NEXT: neg a4, a2 +; RV32ZBA-NEXT: snez a5, a2 +; RV32ZBA-NEXT: neg a6, a3 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a3, .LBB47_15 +; RV32ZBA-NEXT: .LBB47_14: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: .LBB47_15: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a1, .LBB47_21 +; RV32ZBA-NEXT: # %bb.16: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a6, a0 +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: j .LBB47_22 +; RV32ZBA-NEXT: .LBB47_17: +; RV32ZBA-NEXT: neg a6, a2 +; RV32ZBA-NEXT: snez a0, a2 +; RV32ZBA-NEXT: neg a7, a3 +; RV32ZBA-NEXT: sub a0, a7, a0 +; RV32ZBA-NEXT: .LBB47_18: # %overflow.no.lhs.only +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: slti a7, a3, 0 +; RV32ZBA-NEXT: bltz a3, .LBB47_20 +; RV32ZBA-NEXT: # %bb.19: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: mv a6, a2 +; RV32ZBA-NEXT: .LBB47_20: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a2, a4, a6 +; RV32ZBA-NEXT: mul a3, a5, a6 +; RV32ZBA-NEXT: mul a6, a4, a6 +; RV32ZBA-NEXT: mul a5, a5, a0 +; RV32ZBA-NEXT: mulhu t0, a4, a0 +; RV32ZBA-NEXT: mul a0, a4, a0 +; RV32ZBA-NEXT: xor a1, a7, a1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a5, t0, a5 +; RV32ZBA-NEXT: neg a3, a1 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: xor a4, a6, a3 +; RV32ZBA-NEXT: sltu a2, a0, a2 +; RV32ZBA-NEXT: add a4, a4, a1 +; RV32ZBA-NEXT: xor a0, a0, a3 +; RV32ZBA-NEXT: add a2, a5, a2 +; RV32ZBA-NEXT: sltu a1, a4, a1 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: xor a2, a2, a3 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: j .LBB47_25 +; RV32ZBA-NEXT: .LBB47_21: +; RV32ZBA-NEXT: neg a6, a0 +; RV32ZBA-NEXT: snez a2, a0 +; RV32ZBA-NEXT: neg a7, a1 +; RV32ZBA-NEXT: sub a2, a7, a2 +; RV32ZBA-NEXT: .LBB47_22: # %overflow.no.rhs.only +; RV32ZBA-NEXT: slti a3, a3, 0 +; RV32ZBA-NEXT: slti a7, a1, 0 +; RV32ZBA-NEXT: bltz a1, .LBB47_24 +; RV32ZBA-NEXT: # %bb.23: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: mv a6, a0 +; RV32ZBA-NEXT: .LBB47_24: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a0, a4, a6 +; RV32ZBA-NEXT: mul a1, a5, a6 +; RV32ZBA-NEXT: mul a6, a4, a6 +; RV32ZBA-NEXT: mul a5, a5, a2 +; RV32ZBA-NEXT: mulhu t0, a4, a2 +; RV32ZBA-NEXT: mul a2, a4, a2 +; RV32ZBA-NEXT: xor a3, a3, a7 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a5, t0, a5 +; RV32ZBA-NEXT: neg a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: xor a4, a6, a1 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a4, a4, a3 +; RV32ZBA-NEXT: xor a2, a2, a1 +; RV32ZBA-NEXT: add a0, a5, a0 +; RV32ZBA-NEXT: sltu a3, a4, a3 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: sltu a2, a2, a3 +; RV32ZBA-NEXT: xor a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: .LBB47_25: # %overflow.res +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.not.i64: @@ -3449,7 +4698,13 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.not.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a5, a0, 31 +; RV32ZICOND-NEXT: srai a4, a2, 31 +; RV32ZICOND-NEXT: beq a1, a5, .LBB47_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB47_5 +; 
RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a2, a1, a2 @@ -3477,7 +4732,120 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: xor a0, a0, a4 ; RV32ZICOND-NEXT: xor a4, a5, a4 ; RV32ZICOND-NEXT: or a0, a4, a0 -; RV32ZICOND-NEXT: seqz a0, a0 +; RV32ZICOND-NEXT: j .LBB47_6 +; RV32ZICOND-NEXT: .LBB47_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB47_7 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: slti a4, a1, 0 +; RV32ZICOND-NEXT: neg a5, a0 +; RV32ZICOND-NEXT: snez a6, a0 +; RV32ZICOND-NEXT: neg a7, a1 +; RV32ZICOND-NEXT: snez t0, a2 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; RV32ZICOND-NEXT: neg a7, a3 +; RV32ZICOND-NEXT: sub a7, a7, t0 +; RV32ZICOND-NEXT: slti t0, a3, 0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez a0, a0, a4 +; RV32ZICOND-NEXT: or a5, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a0, a5, a0 +; RV32ZICOND-NEXT: neg a5, a2 +; RV32ZICOND-NEXT: czero.nez a1, a1, a4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.nez a2, a2, t0 +; RV32ZICOND-NEXT: czero.nez a3, a3, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a1 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, t0, a4 +; RV32ZICOND-NEXT: or a5, a5, a2 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: or a7, a7, a3 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: neg t0, a4 +; RV32ZICOND-NEXT: or a2, a5, a2 +; RV32ZICOND-NEXT: or a1, a6, a1 +; RV32ZICOND-NEXT: or a3, a7, a3 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 +; RV32ZICOND-NEXT: mul a6, a0, a2 +; RV32ZICOND-NEXT: mul a2, a1, a2 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: mulhu a7, a0, a3 +; RV32ZICOND-NEXT: mul a0, a0, a3 +; RV32ZICOND-NEXT: xor a3, a6, t0 +; RV32ZICOND-NEXT: add a2, a5, a2 +; RV32ZICOND-NEXT: add a1, a7, a1 +; RV32ZICOND-NEXT: add a3, a3, a4 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: sltu a3, a3, a4 +; RV32ZICOND-NEXT: sltu a2, a0, a2 +; RV32ZICOND-NEXT: xor a0, a0, t0 +; RV32ZICOND-NEXT: add a1, a1, a2 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: sltu a0, a0, a3 +; RV32ZICOND-NEXT: xor a1, a1, t0 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: j .LBB47_6 +; RV32ZICOND-NEXT: .LBB47_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: slti a4, a3, 0 +; RV32ZICOND-NEXT: neg a5, a2 +; RV32ZICOND-NEXT: snez a6, a2 +; RV32ZICOND-NEXT: neg a7, a3 +; RV32ZICOND-NEXT: snez t0, a0 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; RV32ZICOND-NEXT: neg a7, a1 +; RV32ZICOND-NEXT: sub a7, a7, t0 +; RV32ZICOND-NEXT: slti t0, a1, 0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez a2, a2, a4 +; RV32ZICOND-NEXT: or a5, a5, a2 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a2, a5, a2 +; RV32ZICOND-NEXT: neg a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.nez a0, a0, t0 +; RV32ZICOND-NEXT: czero.nez a1, a1, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a3 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, a4, t0 +; RV32ZICOND-NEXT: or a5, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: or a7, a7, a1 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: neg t0, a4 +; RV32ZICOND-NEXT: or a0, a5, a0 +; RV32ZICOND-NEXT: or a3, 
a6, a3 +; RV32ZICOND-NEXT: or a1, a7, a1 +; RV32ZICOND-NEXT: mulhu a5, a2, a0 +; RV32ZICOND-NEXT: mul a6, a2, a0 +; RV32ZICOND-NEXT: mul a0, a3, a0 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: mulhu a7, a2, a1 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: xor a2, a6, t0 +; RV32ZICOND-NEXT: add a0, a5, a0 +; RV32ZICOND-NEXT: add a3, a7, a3 +; RV32ZICOND-NEXT: add a2, a2, a4 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a2, a2, a4 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: xor a1, a1, t0 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: add a1, a1, a2 +; RV32ZICOND-NEXT: sltu a1, a1, a2 +; RV32ZICOND-NEXT: xor a0, a0, t0 +; RV32ZICOND-NEXT: add a0, a0, a1 +; RV32ZICOND-NEXT: .LBB47_6: # %overflow.res +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: xori a0, a0, 1 +; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB47_7: # %overflow.no +; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.not.i64: @@ -3617,7 +4985,11 @@ entry: define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.select.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB50_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beqz a3, .LBB50_5 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: snez a6, a3 @@ -3634,12 +5006,42 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: snez a6, a6 ; RV32-NEXT: or a5, a5, a6 ; RV32-NEXT: or a4, a5, a4 -; RV32-NEXT: bnez a4, .LBB50_2 -; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: andi a4, a4, 1 +; RV32-NEXT: beqz a4, .LBB50_7 +; RV32-NEXT: j .LBB50_8 +; RV32-NEXT: .LBB50_3: # %overflow.no.lhs +; RV32-NEXT: beqz a3, .LBB50_9 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a4, a0, a2 +; RV32-NEXT: mul a5, a1, a2 +; RV32-NEXT: mulhu a6, a0, a3 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: mul a5, a1, a3 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: mul a6, a0, a3 +; RV32-NEXT: j .LBB50_6 +; RV32-NEXT: .LBB50_5: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a4, a2, a0 +; RV32-NEXT: mul a5, a3, a0 +; RV32-NEXT: mulhu a6, a2, a1 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: mul a5, a3, a1 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: mul a6, a2, a1 +; RV32-NEXT: .LBB50_6: # %overflow.res +; RV32-NEXT: add a6, a4, a6 +; RV32-NEXT: sltu a4, a6, a4 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: snez a4, a4 +; RV32-NEXT: andi a4, a4, 1 +; RV32-NEXT: bnez a4, .LBB50_8 +; RV32-NEXT: .LBB50_7: # %overflow.res ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB50_2: # %entry +; RV32-NEXT: .LBB50_8: # %overflow.res ; RV32-NEXT: ret +; RV32-NEXT: .LBB50_9: # %overflow.no +; RV32-NEXT: j .LBB50_7 ; ; RV64-LABEL: umulo.select.i64: ; RV64: # %bb.0: # %entry @@ -3651,7 +5053,11 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.select.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB50_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beqz a3, .LBB50_5 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: snez a6, a3 @@ -3668,12 +5074,42 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: snez a6, a6 ; RV32ZBA-NEXT: or a5, a5, a6 ; RV32ZBA-NEXT: or a4, a5, a4 -; RV32ZBA-NEXT: bnez a4, .LBB50_2 -; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: andi a4, a4, 1 +; RV32ZBA-NEXT: beqz a4, .LBB50_7 +; RV32ZBA-NEXT: j 
.LBB50_8 +; RV32ZBA-NEXT: .LBB50_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beqz a3, .LBB50_9 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a4, a0, a2 +; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: mulhu a6, a0, a3 +; RV32ZBA-NEXT: add a4, a4, a5 +; RV32ZBA-NEXT: mul a5, a1, a3 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: mul a6, a0, a3 +; RV32ZBA-NEXT: j .LBB50_6 +; RV32ZBA-NEXT: .LBB50_5: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a4, a2, a0 +; RV32ZBA-NEXT: mul a5, a3, a0 +; RV32ZBA-NEXT: mulhu a6, a2, a1 +; RV32ZBA-NEXT: add a4, a4, a5 +; RV32ZBA-NEXT: mul a5, a3, a1 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: mul a6, a2, a1 +; RV32ZBA-NEXT: .LBB50_6: # %overflow.res +; RV32ZBA-NEXT: add a6, a4, a6 +; RV32ZBA-NEXT: sltu a4, a6, a4 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: snez a4, a4 +; RV32ZBA-NEXT: andi a4, a4, 1 +; RV32ZBA-NEXT: bnez a4, .LBB50_8 +; RV32ZBA-NEXT: .LBB50_7: # %overflow.res ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB50_2: # %entry +; RV32ZBA-NEXT: .LBB50_8: # %overflow.res ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB50_9: # %overflow.no +; RV32ZBA-NEXT: j .LBB50_7 ; ; RV64ZBA-LABEL: umulo.select.i64: ; RV64ZBA: # %bb.0: # %entry @@ -3685,7 +5121,11 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.select.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB50_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB50_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: snez a6, a3 @@ -3702,6 +5142,36 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: snez a6, a6 ; RV32ZICOND-NEXT: or a5, a5, a6 ; RV32ZICOND-NEXT: or a4, a5, a4 +; RV32ZICOND-NEXT: j .LBB50_8 +; RV32ZICOND-NEXT: .LBB50_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB50_7 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: mulhu a4, a0, a2 +; RV32ZICOND-NEXT: mul a5, a1, a2 +; RV32ZICOND-NEXT: mulhu a6, a0, a3 +; RV32ZICOND-NEXT: add a4, a4, a5 +; RV32ZICOND-NEXT: mul a5, a1, a3 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: mul a6, a0, a3 +; RV32ZICOND-NEXT: j .LBB50_6 +; RV32ZICOND-NEXT: .LBB50_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: mulhu a4, a2, a0 +; RV32ZICOND-NEXT: mul a5, a3, a0 +; RV32ZICOND-NEXT: mulhu a6, a2, a1 +; RV32ZICOND-NEXT: add a4, a4, a5 +; RV32ZICOND-NEXT: mul a5, a3, a1 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: mul a6, a2, a1 +; RV32ZICOND-NEXT: .LBB50_6: # %overflow.res +; RV32ZICOND-NEXT: add a6, a4, a6 +; RV32ZICOND-NEXT: sltu a4, a6, a4 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: snez a4, a4 +; RV32ZICOND-NEXT: j .LBB50_8 +; RV32ZICOND-NEXT: .LBB50_7: # %overflow.no +; RV32ZICOND-NEXT: li a4, 0 +; RV32ZICOND-NEXT: .LBB50_8: # %overflow.res +; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: czero.nez a3, a3, a4 @@ -3726,7 +5196,11 @@ entry: define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.not.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB51_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beqz a3, .LBB51_5 +; RV32-NEXT: # %bb.2: # %overflow ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhu a6, a0, a2 @@ -3745,6 +5219,38 @@ define i1 
@umulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret +; RV32-NEXT: .LBB51_3: # %overflow.no.lhs +; RV32-NEXT: beqz a3, .LBB51_7 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a4, a0, a2 +; RV32-NEXT: mul a2, a1, a2 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: mulhu a4, a0, a3 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: j .LBB51_6 +; RV32-NEXT: .LBB51_5: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a4, a2, a0 +; RV32-NEXT: mul a0, a3, a0 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: mulhu a4, a2, a1 +; RV32-NEXT: mul a3, a3, a1 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: mul a1, a2, a1 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: .LBB51_6: # %overflow.no.rhs.only +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB51_7: # %overflow.no +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret ; ; RV64-LABEL: umulo.not.i64: ; RV64: # %bb.0: # %entry @@ -3753,7 +5259,11 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.not.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB51_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beqz a3, .LBB51_5 +; RV32ZBA-NEXT: # %bb.2: # %overflow ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhu a6, a0, a2 @@ -3772,6 +5282,38 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: or a0, a0, a2 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB51_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beqz a3, .LBB51_7 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a4, a0, a2 +; RV32ZBA-NEXT: mul a2, a1, a2 +; RV32ZBA-NEXT: add a2, a4, a2 +; RV32ZBA-NEXT: mulhu a4, a0, a3 +; RV32ZBA-NEXT: mul a1, a1, a3 +; RV32ZBA-NEXT: add a1, a4, a1 +; RV32ZBA-NEXT: mul a0, a0, a3 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: sltu a0, a0, a2 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: j .LBB51_6 +; RV32ZBA-NEXT: .LBB51_5: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a4, a2, a0 +; RV32ZBA-NEXT: mul a0, a3, a0 +; RV32ZBA-NEXT: add a0, a4, a0 +; RV32ZBA-NEXT: mulhu a4, a2, a1 +; RV32ZBA-NEXT: mul a3, a3, a1 +; RV32ZBA-NEXT: add a3, a4, a3 +; RV32ZBA-NEXT: mul a1, a2, a1 +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: add a0, a3, a0 +; RV32ZBA-NEXT: .LBB51_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB51_7: # %overflow.no +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.not.i64: ; RV64ZBA: # %bb.0: # %entry @@ -3780,7 +5322,11 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.not.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB51_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB51_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhu a6, a0, a2 @@ -3799,6 +5345,38 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: xori a0, a0, 1 ; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB51_3: # %overflow.no.lhs +; 
RV32ZICOND-NEXT: beqz a3, .LBB51_7 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: mulhu a4, a0, a2 +; RV32ZICOND-NEXT: mul a2, a1, a2 +; RV32ZICOND-NEXT: add a2, a4, a2 +; RV32ZICOND-NEXT: mulhu a4, a0, a3 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: add a1, a4, a1 +; RV32ZICOND-NEXT: mul a0, a0, a3 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: j .LBB51_6 +; RV32ZICOND-NEXT: .LBB51_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: mulhu a4, a2, a0 +; RV32ZICOND-NEXT: mul a0, a3, a0 +; RV32ZICOND-NEXT: add a0, a4, a0 +; RV32ZICOND-NEXT: mulhu a4, a2, a1 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: add a3, a4, a3 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: .LBB51_6: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: xori a0, a0, 1 +; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB51_7: # %overflow.no +; RV32ZICOND-NEXT: li a0, 1 +; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.not.i64: ; RV64ZICOND: # %bb.0: # %entry @@ -4656,7 +6234,13 @@ continue: define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.br.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a5, a0, 31 +; RV32-NEXT: srai a4, a2, 31 +; RV32-NEXT: beq a1, a5, .LBB61_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beq a3, a4, .LBB61_6 +; RV32-NEXT: # %bb.2: # %overflow1 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a2, a1, a2 @@ -4684,13 +6268,133 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a4, a5, a4 ; RV32-NEXT: or a0, a4, a0 -; RV32-NEXT: beqz a0, .LBB61_2 -; RV32-NEXT: # %bb.1: # %overflow -; RV32-NEXT: li a0, 0 -; RV32-NEXT: ret -; RV32-NEXT: .LBB61_2: # %continue +; RV32-NEXT: j .LBB61_26 +; RV32-NEXT: .LBB61_3: # %overflow.no.lhs +; RV32-NEXT: beq a3, a4, .LBB61_8 +; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32-NEXT: bltz a1, .LBB61_10 +; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: bgez a1, .LBB61_11 +; RV32-NEXT: j .LBB61_12 +; RV32-NEXT: .LBB61_6: # %overflow.no.rhs.only +; RV32-NEXT: bltz a3, .LBB61_14 +; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: bgez a3, .LBB61_15 +; RV32-NEXT: j .LBB61_16 +; RV32-NEXT: .LBB61_8: # %overflow.no +; RV32-NEXT: .LBB61_9: # %continue ; RV32-NEXT: li a0, 1 ; RV32-NEXT: ret +; RV32-NEXT: .LBB61_10: +; RV32-NEXT: neg a4, a0 +; RV32-NEXT: snez a5, a0 +; RV32-NEXT: neg a6, a1 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a1, .LBB61_12 +; RV32-NEXT: .LBB61_11: # %overflow.no.lhs.only +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: .LBB61_12: # %overflow.no.lhs.only +; RV32-NEXT: bltz a3, .LBB61_18 +; RV32-NEXT: # %bb.13: # %overflow.no.lhs.only +; RV32-NEXT: mv a6, a2 +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: j .LBB61_19 +; RV32-NEXT: .LBB61_14: +; RV32-NEXT: neg a4, a2 +; RV32-NEXT: snez a5, a2 +; RV32-NEXT: neg a6, a3 +; RV32-NEXT: sub a5, a6, a5 +; RV32-NEXT: bltz a3, .LBB61_16 +; RV32-NEXT: .LBB61_15: # %overflow.no.rhs.only +; RV32-NEXT: mv a5, a3 +; RV32-NEXT: mv a4, a2 +; RV32-NEXT: .LBB61_16: # %overflow.no.rhs.only +; RV32-NEXT: bltz a1, .LBB61_22 +; RV32-NEXT: # %bb.17: # %overflow.no.rhs.only +; RV32-NEXT: mv a6, a0 +; RV32-NEXT: mv a2, 
a1 +; RV32-NEXT: j .LBB61_23 +; RV32-NEXT: .LBB61_18: +; RV32-NEXT: neg a6, a2 +; RV32-NEXT: snez a0, a2 +; RV32-NEXT: neg a7, a3 +; RV32-NEXT: sub a0, a7, a0 +; RV32-NEXT: .LBB61_19: # %overflow.no.lhs.only +; RV32-NEXT: slti a1, a1, 0 +; RV32-NEXT: slti a7, a3, 0 +; RV32-NEXT: bltz a3, .LBB61_21 +; RV32-NEXT: # %bb.20: # %overflow.no.lhs.only +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: mv a6, a2 +; RV32-NEXT: .LBB61_21: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a2, a4, a6 +; RV32-NEXT: mul a3, a5, a6 +; RV32-NEXT: mul a6, a4, a6 +; RV32-NEXT: mul a5, a5, a0 +; RV32-NEXT: mulhu t0, a4, a0 +; RV32-NEXT: mul a0, a4, a0 +; RV32-NEXT: xor a1, a7, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a5, t0, a5 +; RV32-NEXT: neg a3, a1 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: xor a4, a6, a3 +; RV32-NEXT: sltu a2, a0, a2 +; RV32-NEXT: add a4, a4, a1 +; RV32-NEXT: xor a0, a0, a3 +; RV32-NEXT: add a2, a5, a2 +; RV32-NEXT: sltu a1, a4, a1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: xor a2, a2, a3 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: j .LBB61_26 +; RV32-NEXT: .LBB61_22: +; RV32-NEXT: neg a6, a0 +; RV32-NEXT: snez a2, a0 +; RV32-NEXT: neg a7, a1 +; RV32-NEXT: sub a2, a7, a2 +; RV32-NEXT: .LBB61_23: # %overflow.no.rhs.only +; RV32-NEXT: slti a3, a3, 0 +; RV32-NEXT: slti a7, a1, 0 +; RV32-NEXT: bltz a1, .LBB61_25 +; RV32-NEXT: # %bb.24: # %overflow.no.rhs.only +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: mv a6, a0 +; RV32-NEXT: .LBB61_25: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a0, a4, a6 +; RV32-NEXT: mul a1, a5, a6 +; RV32-NEXT: mul a6, a4, a6 +; RV32-NEXT: mul a5, a5, a2 +; RV32-NEXT: mulhu t0, a4, a2 +; RV32-NEXT: mul a2, a4, a2 +; RV32-NEXT: xor a3, a3, a7 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a5, t0, a5 +; RV32-NEXT: neg a1, a3 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: xor a4, a6, a1 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a4, a4, a3 +; RV32-NEXT: xor a2, a2, a1 +; RV32-NEXT: add a0, a5, a0 +; RV32-NEXT: sltu a3, a4, a3 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: sltu a2, a2, a3 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: .LBB61_26: # %overflow.res +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB61_9 +; RV32-NEXT: # %bb.27: # %overflow +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret ; ; RV64-LABEL: smulo.br.i64: ; RV64: # %bb.0: # %entry @@ -4706,7 +6410,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.br.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a5, a0, 31 +; RV32ZBA-NEXT: srai a4, a2, 31 +; RV32ZBA-NEXT: beq a1, a5, .LBB61_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB61_6 +; RV32ZBA-NEXT: # %bb.2: # %overflow1 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a2, a1, a2 @@ -4734,13 +6444,133 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a4, a5, a4 ; RV32ZBA-NEXT: or a0, a4, a0 -; RV32ZBA-NEXT: beqz a0, .LBB61_2 -; RV32ZBA-NEXT: # %bb.1: # %overflow -; RV32ZBA-NEXT: li a0, 0 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB61_2: # %continue +; RV32ZBA-NEXT: j .LBB61_26 +; RV32ZBA-NEXT: .LBB61_3: # %overflow.no.lhs +; RV32ZBA-NEXT: beq a3, a4, .LBB61_8 +; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a1, .LBB61_10 +; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: bgez a1, .LBB61_11 +; 
RV32ZBA-NEXT: j .LBB61_12 +; RV32ZBA-NEXT: .LBB61_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a3, .LBB61_14 +; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: bgez a3, .LBB61_15 +; RV32ZBA-NEXT: j .LBB61_16 +; RV32ZBA-NEXT: .LBB61_8: # %overflow.no +; RV32ZBA-NEXT: .LBB61_9: # %continue ; RV32ZBA-NEXT: li a0, 1 ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB61_10: +; RV32ZBA-NEXT: neg a4, a0 +; RV32ZBA-NEXT: snez a5, a0 +; RV32ZBA-NEXT: neg a6, a1 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a1, .LBB61_12 +; RV32ZBA-NEXT: .LBB61_11: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: .LBB61_12: # %overflow.no.lhs.only +; RV32ZBA-NEXT: bltz a3, .LBB61_18 +; RV32ZBA-NEXT: # %bb.13: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a6, a2 +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: j .LBB61_19 +; RV32ZBA-NEXT: .LBB61_14: +; RV32ZBA-NEXT: neg a4, a2 +; RV32ZBA-NEXT: snez a5, a2 +; RV32ZBA-NEXT: neg a6, a3 +; RV32ZBA-NEXT: sub a5, a6, a5 +; RV32ZBA-NEXT: bltz a3, .LBB61_16 +; RV32ZBA-NEXT: .LBB61_15: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a5, a3 +; RV32ZBA-NEXT: mv a4, a2 +; RV32ZBA-NEXT: .LBB61_16: # %overflow.no.rhs.only +; RV32ZBA-NEXT: bltz a1, .LBB61_22 +; RV32ZBA-NEXT: # %bb.17: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a6, a0 +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: j .LBB61_23 +; RV32ZBA-NEXT: .LBB61_18: +; RV32ZBA-NEXT: neg a6, a2 +; RV32ZBA-NEXT: snez a0, a2 +; RV32ZBA-NEXT: neg a7, a3 +; RV32ZBA-NEXT: sub a0, a7, a0 +; RV32ZBA-NEXT: .LBB61_19: # %overflow.no.lhs.only +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: slti a7, a3, 0 +; RV32ZBA-NEXT: bltz a3, .LBB61_21 +; RV32ZBA-NEXT: # %bb.20: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: mv a6, a2 +; RV32ZBA-NEXT: .LBB61_21: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a2, a4, a6 +; RV32ZBA-NEXT: mul a3, a5, a6 +; RV32ZBA-NEXT: mul a6, a4, a6 +; RV32ZBA-NEXT: mul a5, a5, a0 +; RV32ZBA-NEXT: mulhu t0, a4, a0 +; RV32ZBA-NEXT: mul a0, a4, a0 +; RV32ZBA-NEXT: xor a1, a7, a1 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: add a5, t0, a5 +; RV32ZBA-NEXT: neg a3, a1 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: xor a4, a6, a3 +; RV32ZBA-NEXT: sltu a2, a0, a2 +; RV32ZBA-NEXT: add a4, a4, a1 +; RV32ZBA-NEXT: xor a0, a0, a3 +; RV32ZBA-NEXT: add a2, a5, a2 +; RV32ZBA-NEXT: sltu a1, a4, a1 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: xor a2, a2, a3 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: j .LBB61_26 +; RV32ZBA-NEXT: .LBB61_22: +; RV32ZBA-NEXT: neg a6, a0 +; RV32ZBA-NEXT: snez a2, a0 +; RV32ZBA-NEXT: neg a7, a1 +; RV32ZBA-NEXT: sub a2, a7, a2 +; RV32ZBA-NEXT: .LBB61_23: # %overflow.no.rhs.only +; RV32ZBA-NEXT: slti a3, a3, 0 +; RV32ZBA-NEXT: slti a7, a1, 0 +; RV32ZBA-NEXT: bltz a1, .LBB61_25 +; RV32ZBA-NEXT: # %bb.24: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mv a2, a1 +; RV32ZBA-NEXT: mv a6, a0 +; RV32ZBA-NEXT: .LBB61_25: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a0, a4, a6 +; RV32ZBA-NEXT: mul a1, a5, a6 +; RV32ZBA-NEXT: mul a6, a4, a6 +; RV32ZBA-NEXT: mul a5, a5, a2 +; RV32ZBA-NEXT: mulhu t0, a4, a2 +; RV32ZBA-NEXT: mul a2, a4, a2 +; RV32ZBA-NEXT: xor a3, a3, a7 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a5, t0, a5 +; RV32ZBA-NEXT: neg a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: xor a4, a6, a1 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a4, a4, a3 +; RV32ZBA-NEXT: xor a2, a2, a1 +; RV32ZBA-NEXT: add a0, a5, a0 +; 
RV32ZBA-NEXT: sltu a3, a4, a3 +; RV32ZBA-NEXT: add a2, a2, a3 +; RV32ZBA-NEXT: sltu a2, a2, a3 +; RV32ZBA-NEXT: xor a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: .LBB61_26: # %overflow.res +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: andi a0, a0, 1 +; RV32ZBA-NEXT: beqz a0, .LBB61_9 +; RV32ZBA-NEXT: # %bb.27: # %overflow +; RV32ZBA-NEXT: li a0, 0 +; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -4756,7 +6586,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.br.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a5, a0, 31 +; RV32ZICOND-NEXT: srai a4, a2, 31 +; RV32ZICOND-NEXT: beq a1, a5, .LBB61_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB61_5 +; RV32ZICOND-NEXT: # %bb.2: # %overflow1 ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a2, a1, a2 @@ -4784,11 +6620,123 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: xor a0, a0, a4 ; RV32ZICOND-NEXT: xor a4, a5, a4 ; RV32ZICOND-NEXT: or a0, a4, a0 -; RV32ZICOND-NEXT: beqz a0, .LBB61_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow +; RV32ZICOND-NEXT: j .LBB61_6 +; RV32ZICOND-NEXT: .LBB61_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: beq a3, a4, .LBB61_8 +; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: slti a4, a1, 0 +; RV32ZICOND-NEXT: neg a5, a0 +; RV32ZICOND-NEXT: snez a6, a0 +; RV32ZICOND-NEXT: neg a7, a1 +; RV32ZICOND-NEXT: snez t0, a2 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; RV32ZICOND-NEXT: neg a7, a3 +; RV32ZICOND-NEXT: sub a7, a7, t0 +; RV32ZICOND-NEXT: slti t0, a3, 0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez a0, a0, a4 +; RV32ZICOND-NEXT: or a5, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a0, a5, a0 +; RV32ZICOND-NEXT: neg a5, a2 +; RV32ZICOND-NEXT: czero.nez a1, a1, a4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.nez a2, a2, t0 +; RV32ZICOND-NEXT: czero.nez a3, a3, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a1 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, t0, a4 +; RV32ZICOND-NEXT: or a5, a5, a2 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: or a7, a7, a3 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: neg t0, a4 +; RV32ZICOND-NEXT: or a2, a5, a2 +; RV32ZICOND-NEXT: or a1, a6, a1 +; RV32ZICOND-NEXT: or a3, a7, a3 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 +; RV32ZICOND-NEXT: mul a6, a0, a2 +; RV32ZICOND-NEXT: mul a2, a1, a2 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: mulhu a7, a0, a3 +; RV32ZICOND-NEXT: mul a0, a0, a3 +; RV32ZICOND-NEXT: xor a3, a6, t0 +; RV32ZICOND-NEXT: add a2, a5, a2 +; RV32ZICOND-NEXT: add a1, a7, a1 +; RV32ZICOND-NEXT: add a3, a3, a4 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: sltu a3, a3, a4 +; RV32ZICOND-NEXT: sltu a2, a0, a2 +; RV32ZICOND-NEXT: xor a0, a0, t0 +; RV32ZICOND-NEXT: add a1, a1, a2 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: sltu a0, a0, a3 +; RV32ZICOND-NEXT: xor a1, a1, t0 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: j .LBB61_6 +; RV32ZICOND-NEXT: .LBB61_5: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: slti a4, a3, 0 +; RV32ZICOND-NEXT: neg a5, a2 +; RV32ZICOND-NEXT: snez a6, a2 +; RV32ZICOND-NEXT: neg a7, a3 +; RV32ZICOND-NEXT: snez t0, a0 +; RV32ZICOND-NEXT: sub a6, a7, a6 +; 
RV32ZICOND-NEXT: neg a7, a1 +; RV32ZICOND-NEXT: sub a7, a7, t0 +; RV32ZICOND-NEXT: slti t0, a1, 0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: czero.nez a2, a2, a4 +; RV32ZICOND-NEXT: or a5, a5, a2 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 +; RV32ZICOND-NEXT: or a2, a5, a2 +; RV32ZICOND-NEXT: neg a5, a0 +; RV32ZICOND-NEXT: czero.nez a3, a3, a4 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.nez a0, a0, t0 +; RV32ZICOND-NEXT: czero.nez a1, a1, t0 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: or a6, a6, a3 +; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 +; RV32ZICOND-NEXT: xor a4, a4, t0 +; RV32ZICOND-NEXT: or a5, a5, a0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: or a7, a7, a1 +; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 +; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 +; RV32ZICOND-NEXT: neg t0, a4 +; RV32ZICOND-NEXT: or a0, a5, a0 +; RV32ZICOND-NEXT: or a3, a6, a3 +; RV32ZICOND-NEXT: or a1, a7, a1 +; RV32ZICOND-NEXT: mulhu a5, a2, a0 +; RV32ZICOND-NEXT: mul a6, a2, a0 +; RV32ZICOND-NEXT: mul a0, a3, a0 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: mulhu a7, a2, a1 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: xor a2, a6, t0 +; RV32ZICOND-NEXT: add a0, a5, a0 +; RV32ZICOND-NEXT: add a3, a7, a3 +; RV32ZICOND-NEXT: add a2, a2, a4 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a2, a2, a4 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: xor a1, a1, t0 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: add a1, a1, a2 +; RV32ZICOND-NEXT: sltu a1, a1, a2 +; RV32ZICOND-NEXT: xor a0, a0, t0 +; RV32ZICOND-NEXT: add a0, a0, a1 +; RV32ZICOND-NEXT: .LBB61_6: # %overflow.res +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: andi a0, a0, 1 +; RV32ZICOND-NEXT: beqz a0, .LBB61_9 +; RV32ZICOND-NEXT: # %bb.7: # %overflow ; RV32ZICOND-NEXT: li a0, 0 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB61_2: # %continue +; RV32ZICOND-NEXT: .LBB61_8: # %overflow.no +; RV32ZICOND-NEXT: .LBB61_9: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret ; @@ -4819,43 +6767,56 @@ continue: define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32-LABEL: smulo2.br.i64: -; RV32: # %bb.0: # %entry -; RV32-NEXT: li a2, -13 -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: li a4, -1 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: mul a6, a1, a2 -; RV32-NEXT: mulhsu a2, a1, a2 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: sltu a6, a5, a6 -; RV32-NEXT: sub a5, a5, a0 -; RV32-NEXT: mulhsu a0, a4, a0 -; RV32-NEXT: add a2, a2, a6 -; RV32-NEXT: sltu a3, a5, a3 -; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: srai a3, a2, 31 -; RV32-NEXT: srai a6, a0, 31 -; RV32-NEXT: add a3, a3, a6 -; RV32-NEXT: neg a6, a1 -; RV32-NEXT: mulh a4, a1, a4 -; RV32-NEXT: srai a5, a5, 31 +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: srai a2, a0, 31 +; RV32-NEXT: beq a1, a2, .LBB62_3 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: slti a2, a1, 0 +; RV32-NEXT: bltz a1, .LBB62_5 +; RV32-NEXT: # %bb.2: # %overflow.lhs +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: xori a3, a2, 1 +; RV32-NEXT: bgez a1, .LBB62_6 +; RV32-NEXT: j .LBB62_7 +; RV32-NEXT: .LBB62_3: # %overflow.no.lhs +; RV32-NEXT: .LBB62_4: # %continue +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret +; RV32-NEXT: .LBB62_5: +; RV32-NEXT: neg a4, a0 +; RV32-NEXT: snez a3, a0 +; RV32-NEXT: neg a5, a1 +; RV32-NEXT: sub a5, a5, a3 +; RV32-NEXT: xori a3, a2, 1 +; RV32-NEXT: bltz a1, .LBB62_7 +; RV32-NEXT: .LBB62_6: # %overflow.lhs +; RV32-NEXT: mv a5, a1 +; RV32-NEXT: mv a4, a0 +; RV32-NEXT: .LBB62_7: # %overflow.lhs +; 
RV32-NEXT: li a0, 13 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: mul a1, a4, a0 +; RV32-NEXT: mulhu a4, a4, a0 +; RV32-NEXT: mulhu a6, a5, a0 +; RV32-NEXT: mul a0, a5, a0 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: xor a1, a1, a2 +; RV32-NEXT: sltu a4, a0, a4 +; RV32-NEXT: add a1, a1, a3 +; RV32-NEXT: xor a0, a0, a2 +; RV32-NEXT: add a4, a6, a4 +; RV32-NEXT: sltu a1, a1, a3 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: xor a2, a4, a2 ; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: sltu a2, a0, a2 -; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: sltu a1, a0, a6 -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: xor a1, a1, a5 -; RV32-NEXT: xor a0, a0, a5 -; RV32-NEXT: or a0, a0, a1 -; RV32-NEXT: beqz a0, .LBB62_2 -; RV32-NEXT: # %bb.1: # %overflow +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB62_4 +; RV32-NEXT: # %bb.8: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret -; RV32-NEXT: .LBB62_2: # %continue -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret ; ; RV64-LABEL: smulo2.br.i64: ; RV64: # %bb.0: # %entry @@ -4872,43 +6833,58 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo2.br.i64: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: li a2, -13 -; RV32ZBA-NEXT: neg a3, a0 -; RV32ZBA-NEXT: li a4, -1 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: mul a6, a1, a2 -; RV32ZBA-NEXT: mulhsu a2, a1, a2 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: sltu a6, a5, a6 -; RV32ZBA-NEXT: sub a5, a5, a0 -; RV32ZBA-NEXT: mulhsu a0, a4, a0 -; RV32ZBA-NEXT: add a2, a2, a6 -; RV32ZBA-NEXT: sltu a3, a5, a3 +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: srai a2, a0, 31 +; RV32ZBA-NEXT: beq a1, a2, .LBB62_3 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: slti a2, a1, 0 +; RV32ZBA-NEXT: bltz a1, .LBB62_5 +; RV32ZBA-NEXT: # %bb.2: # %overflow.lhs +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: xori a3, a2, 1 +; RV32ZBA-NEXT: bgez a1, .LBB62_6 +; RV32ZBA-NEXT: j .LBB62_7 +; RV32ZBA-NEXT: .LBB62_3: # %overflow.no.lhs +; RV32ZBA-NEXT: .LBB62_4: # %continue +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB62_5: +; RV32ZBA-NEXT: neg a4, a0 +; RV32ZBA-NEXT: snez a3, a0 +; RV32ZBA-NEXT: neg a5, a1 +; RV32ZBA-NEXT: sub a5, a5, a3 +; RV32ZBA-NEXT: xori a3, a2, 1 +; RV32ZBA-NEXT: bltz a1, .LBB62_7 +; RV32ZBA-NEXT: .LBB62_6: # %overflow.lhs +; RV32ZBA-NEXT: mv a5, a1 +; RV32ZBA-NEXT: mv a4, a0 +; RV32ZBA-NEXT: .LBB62_7: # %overflow.lhs +; RV32ZBA-NEXT: sh1add a0, a4, a4 +; RV32ZBA-NEXT: li a1, 13 +; RV32ZBA-NEXT: sh1add a6, a5, a5 +; RV32ZBA-NEXT: addi a2, a2, -1 +; RV32ZBA-NEXT: sh2add a0, a0, a4 +; RV32ZBA-NEXT: mulhu a4, a4, a1 +; RV32ZBA-NEXT: sh2add a6, a6, a5 +; RV32ZBA-NEXT: mulhu a1, a5, a1 +; RV32ZBA-NEXT: add a6, a4, a6 +; RV32ZBA-NEXT: xor a0, a0, a2 +; RV32ZBA-NEXT: sltu a4, a6, a4 ; RV32ZBA-NEXT: add a0, a0, a3 -; RV32ZBA-NEXT: srai a3, a2, 31 -; RV32ZBA-NEXT: srai a6, a0, 31 -; RV32ZBA-NEXT: add a3, a3, a6 -; RV32ZBA-NEXT: neg a6, a1 -; RV32ZBA-NEXT: mulh a4, a1, a4 -; RV32ZBA-NEXT: srai a5, a5, 31 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: sltu a2, a0, a2 -; RV32ZBA-NEXT: sub a0, a0, a1 -; RV32ZBA-NEXT: add a2, a3, a2 -; RV32ZBA-NEXT: sltu a1, a0, a6 -; RV32ZBA-NEXT: add a2, a4, a2 -; RV32ZBA-NEXT: add a1, a2, a1 -; RV32ZBA-NEXT: xor a1, a1, a5 -; RV32ZBA-NEXT: xor a0, a0, a5 -; RV32ZBA-NEXT: or a0, a0, a1 -; RV32ZBA-NEXT: beqz a0, .LBB62_2 -; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: xor a5, a6, a2 
+; RV32ZBA-NEXT: add a1, a1, a4 +; RV32ZBA-NEXT: sltu a0, a0, a3 +; RV32ZBA-NEXT: add a5, a5, a0 +; RV32ZBA-NEXT: sltu a0, a5, a0 +; RV32ZBA-NEXT: xor a1, a1, a2 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: andi a0, a0, 1 +; RV32ZBA-NEXT: beqz a0, .LBB62_4 +; RV32ZBA-NEXT: # %bb.8: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB62_2: # %continue -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo2.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -4925,43 +6901,54 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo2.br.i64: -; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: li a2, -13 +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: srai a2, a0, 31 +; RV32ZICOND-NEXT: beq a1, a2, .LBB62_3 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: slti a2, a1, 0 ; RV32ZICOND-NEXT: neg a3, a0 -; RV32ZICOND-NEXT: li a4, -1 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: mul a6, a1, a2 -; RV32ZICOND-NEXT: mulhsu a2, a1, a2 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: sltu a6, a5, a6 -; RV32ZICOND-NEXT: sub a5, a5, a0 -; RV32ZICOND-NEXT: mulhsu a0, a4, a0 -; RV32ZICOND-NEXT: add a2, a2, a6 -; RV32ZICOND-NEXT: sltu a3, a5, a3 -; RV32ZICOND-NEXT: add a0, a0, a3 -; RV32ZICOND-NEXT: srai a3, a2, 31 -; RV32ZICOND-NEXT: srai a6, a0, 31 -; RV32ZICOND-NEXT: add a3, a3, a6 -; RV32ZICOND-NEXT: neg a6, a1 -; RV32ZICOND-NEXT: mulh a4, a1, a4 -; RV32ZICOND-NEXT: srai a5, a5, 31 -; RV32ZICOND-NEXT: add a0, a2, a0 -; RV32ZICOND-NEXT: sltu a2, a0, a2 -; RV32ZICOND-NEXT: sub a0, a0, a1 -; RV32ZICOND-NEXT: add a2, a3, a2 -; RV32ZICOND-NEXT: sltu a1, a0, a6 -; RV32ZICOND-NEXT: add a2, a4, a2 -; RV32ZICOND-NEXT: add a1, a2, a1 -; RV32ZICOND-NEXT: xor a1, a1, a5 -; RV32ZICOND-NEXT: xor a0, a0, a5 -; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: beqz a0, .LBB62_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow -; RV32ZICOND-NEXT: li a0, 0 -; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: snez a4, a0 +; RV32ZICOND-NEXT: neg a5, a1 +; RV32ZICOND-NEXT: li a6, 13 +; RV32ZICOND-NEXT: sub a5, a5, a4 +; RV32ZICOND-NEXT: xori a4, a2, 1 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a2 +; RV32ZICOND-NEXT: czero.nez a0, a0, a2 +; RV32ZICOND-NEXT: czero.nez a1, a1, a2 +; RV32ZICOND-NEXT: or a3, a3, a0 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a2 +; RV32ZICOND-NEXT: or a5, a5, a1 +; RV32ZICOND-NEXT: czero.eqz a3, a3, a2 +; RV32ZICOND-NEXT: czero.eqz a5, a5, a2 +; RV32ZICOND-NEXT: addi a2, a2, -1 +; RV32ZICOND-NEXT: or a0, a3, a0 +; RV32ZICOND-NEXT: or a1, a5, a1 +; RV32ZICOND-NEXT: mul a3, a0, a6 +; RV32ZICOND-NEXT: mulhu a0, a0, a6 +; RV32ZICOND-NEXT: mulhu a5, a1, a6 +; RV32ZICOND-NEXT: mul a1, a1, a6 +; RV32ZICOND-NEXT: xor a3, a3, a2 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: add a3, a3, a4 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: sltu a3, a3, a4 +; RV32ZICOND-NEXT: xor a1, a1, a2 +; RV32ZICOND-NEXT: add a0, a5, a0 +; RV32ZICOND-NEXT: add a1, a1, a3 +; RV32ZICOND-NEXT: sltu a1, a1, a3 +; RV32ZICOND-NEXT: xor a0, a0, a2 +; RV32ZICOND-NEXT: add a0, a0, a1 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: andi a0, a0, 1 +; RV32ZICOND-NEXT: bnez a0, .LBB62_4 ; RV32ZICOND-NEXT: .LBB62_2: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB62_3: # %overflow.no.lhs +; RV32ZICOND-NEXT: j .LBB62_2 +; RV32ZICOND-NEXT: .LBB62_4: # %overflow +; RV32ZICOND-NEXT: li a0, 0 +; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo2.br.i64: ; 
RV64ZICOND: # %bb.0: # %entry @@ -5079,7 +7066,11 @@ continue: define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.br.i64: -; RV32: # %bb.0: # %entry +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB64_4 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: beqz a3, .LBB64_6 +; RV32-NEXT: # %bb.2: # %overflow1 ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhu a6, a0, a2 @@ -5096,13 +7087,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: sltu a2, a4, a6 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: beqz a0, .LBB64_2 -; RV32-NEXT: # %bb.1: # %overflow -; RV32-NEXT: li a0, 0 -; RV32-NEXT: ret -; RV32-NEXT: .LBB64_2: # %continue +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: bnez a0, .LBB64_8 +; RV32-NEXT: .LBB64_3: # %continue ; RV32-NEXT: li a0, 1 ; RV32-NEXT: ret +; RV32-NEXT: .LBB64_4: # %overflow.no.lhs +; RV32-NEXT: beqz a3, .LBB64_9 +; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32-NEXT: mulhu a4, a0, a2 +; RV32-NEXT: mul a2, a1, a2 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: mulhu a4, a0, a3 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: mul a0, a0, a3 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: j .LBB64_7 +; RV32-NEXT: .LBB64_6: # %overflow.no.rhs.only +; RV32-NEXT: mulhu a4, a2, a0 +; RV32-NEXT: mul a0, a3, a0 +; RV32-NEXT: add a0, a4, a0 +; RV32-NEXT: mulhu a4, a2, a1 +; RV32-NEXT: mul a3, a3, a1 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: mul a1, a2, a1 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: .LBB64_7: # %overflow.no.rhs.only +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: andi a0, a0, 1 +; RV32-NEXT: beqz a0, .LBB64_3 +; RV32-NEXT: .LBB64_8: # %overflow +; RV32-NEXT: li a0, 0 +; RV32-NEXT: ret +; RV32-NEXT: .LBB64_9: # %overflow.no +; RV32-NEXT: j .LBB64_3 ; ; RV64-LABEL: umulo.br.i64: ; RV64: # %bb.0: # %entry @@ -5116,7 +7139,11 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.br.i64: -; RV32ZBA: # %bb.0: # %entry +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB64_4 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: beqz a3, .LBB64_6 +; RV32ZBA-NEXT: # %bb.2: # %overflow1 ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhu a6, a0, a2 @@ -5133,13 +7160,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: sltu a2, a4, a6 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: or a0, a0, a2 -; RV32ZBA-NEXT: beqz a0, .LBB64_2 -; RV32ZBA-NEXT: # %bb.1: # %overflow -; RV32ZBA-NEXT: li a0, 0 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB64_2: # %continue +; RV32ZBA-NEXT: andi a0, a0, 1 +; RV32ZBA-NEXT: bnez a0, .LBB64_8 +; RV32ZBA-NEXT: .LBB64_3: # %continue ; RV32ZBA-NEXT: li a0, 1 ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB64_4: # %overflow.no.lhs +; RV32ZBA-NEXT: beqz a3, .LBB64_9 +; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZBA-NEXT: mulhu a4, a0, a2 +; RV32ZBA-NEXT: mul a2, a1, a2 +; RV32ZBA-NEXT: add a2, a4, a2 +; RV32ZBA-NEXT: mulhu a4, a0, a3 +; RV32ZBA-NEXT: mul a1, a1, a3 +; RV32ZBA-NEXT: add a1, a4, a1 +; RV32ZBA-NEXT: mul a0, a0, a3 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: sltu a0, a0, a2 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: j .LBB64_7 +; RV32ZBA-NEXT: .LBB64_6: # %overflow.no.rhs.only +; RV32ZBA-NEXT: mulhu a4, a2, a0 +; RV32ZBA-NEXT: mul a0, a3, a0 +; RV32ZBA-NEXT: add a0, a4, a0 +; RV32ZBA-NEXT: 
mulhu a4, a2, a1 +; RV32ZBA-NEXT: mul a3, a3, a1 +; RV32ZBA-NEXT: add a3, a4, a3 +; RV32ZBA-NEXT: mul a1, a2, a1 +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: add a0, a3, a0 +; RV32ZBA-NEXT: .LBB64_7: # %overflow.no.rhs.only +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: andi a0, a0, 1 +; RV32ZBA-NEXT: beqz a0, .LBB64_3 +; RV32ZBA-NEXT: .LBB64_8: # %overflow +; RV32ZBA-NEXT: li a0, 0 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB64_9: # %overflow.no +; RV32ZBA-NEXT: j .LBB64_3 ; ; RV64ZBA-LABEL: umulo.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -5153,7 +7212,11 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.br.i64: -; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB64_4 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB64_6 +; RV32ZICOND-NEXT: # %bb.2: # %overflow1 ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhu a6, a0, a2 @@ -5170,13 +7233,45 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: sltu a2, a4, a6 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: beqz a0, .LBB64_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow -; RV32ZICOND-NEXT: li a0, 0 -; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB64_2: # %continue +; RV32ZICOND-NEXT: andi a0, a0, 1 +; RV32ZICOND-NEXT: bnez a0, .LBB64_8 +; RV32ZICOND-NEXT: .LBB64_3: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB64_4: # %overflow.no.lhs +; RV32ZICOND-NEXT: beqz a3, .LBB64_9 +; RV32ZICOND-NEXT: # %bb.5: # %overflow.no.lhs.only +; RV32ZICOND-NEXT: mulhu a4, a0, a2 +; RV32ZICOND-NEXT: mul a2, a1, a2 +; RV32ZICOND-NEXT: add a2, a4, a2 +; RV32ZICOND-NEXT: mulhu a4, a0, a3 +; RV32ZICOND-NEXT: mul a1, a1, a3 +; RV32ZICOND-NEXT: add a1, a4, a1 +; RV32ZICOND-NEXT: mul a0, a0, a3 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: sltu a0, a0, a2 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: j .LBB64_7 +; RV32ZICOND-NEXT: .LBB64_6: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: mulhu a4, a2, a0 +; RV32ZICOND-NEXT: mul a0, a3, a0 +; RV32ZICOND-NEXT: add a0, a4, a0 +; RV32ZICOND-NEXT: mulhu a4, a2, a1 +; RV32ZICOND-NEXT: mul a3, a3, a1 +; RV32ZICOND-NEXT: add a3, a4, a3 +; RV32ZICOND-NEXT: mul a1, a2, a1 +; RV32ZICOND-NEXT: add a1, a0, a1 +; RV32ZICOND-NEXT: sltu a0, a1, a0 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: .LBB64_7: # %overflow.no.rhs.only +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: andi a0, a0, 1 +; RV32ZICOND-NEXT: beqz a0, .LBB64_3 +; RV32ZICOND-NEXT: .LBB64_8: # %overflow +; RV32ZICOND-NEXT: li a0, 0 +; RV32ZICOND-NEXT: ret +; RV32ZICOND-NEXT: .LBB64_9: # %overflow.no +; RV32ZICOND-NEXT: j .LBB64_3 ; ; RV64ZICOND-LABEL: umulo.br.i64: ; RV64ZICOND: # %bb.0: # %entry @@ -5203,16 +7298,13 @@ continue: define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV32-LABEL: umulo2.br.i64: -; RV32: # %bb.0: # %entry -; RV32-NEXT: add a2, a0, a0 -; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a2, a1, a1 -; RV32-NEXT: add a2, a2, a0 -; RV32-NEXT: beq a2, a1, .LBB65_2 -; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: sltu a0, a2, a1 -; RV32-NEXT: .LBB65_2: # %entry -; RV32-NEXT: beqz a0, .LBB65_4 +; RV32: # %bb.0: # %overflow.entry +; RV32-NEXT: beqz a1, .LBB65_2 +; RV32-NEXT: # %bb.1: # %overflow.lhs +; RV32-NEXT: srli a1, a1, 31 +; RV32-NEXT: .LBB65_2: # %overflow.res +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: beqz a1, .LBB65_4 ; 
RV32-NEXT: # %bb.3: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -5232,16 +7324,13 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo2.br.i64: -; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: add a2, a0, a0 -; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a2, a1, a1 -; RV32ZBA-NEXT: add a2, a2, a0 -; RV32ZBA-NEXT: beq a2, a1, .LBB65_2 -; RV32ZBA-NEXT: # %bb.1: # %entry -; RV32ZBA-NEXT: sltu a0, a2, a1 -; RV32ZBA-NEXT: .LBB65_2: # %entry -; RV32ZBA-NEXT: beqz a0, .LBB65_4 +; RV32ZBA: # %bb.0: # %overflow.entry +; RV32ZBA-NEXT: beqz a1, .LBB65_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs +; RV32ZBA-NEXT: srli a1, a1, 31 +; RV32ZBA-NEXT: .LBB65_2: # %overflow.res +; RV32ZBA-NEXT: andi a1, a1, 1 +; RV32ZBA-NEXT: beqz a1, .LBB65_4 ; RV32ZBA-NEXT: # %bb.3: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret @@ -5261,21 +7350,17 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo2.br.i64: -; RV32ZICOND: # %bb.0: # %entry -; RV32ZICOND-NEXT: add a2, a0, a0 -; RV32ZICOND-NEXT: add a3, a1, a1 -; RV32ZICOND-NEXT: sltu a0, a2, a0 -; RV32ZICOND-NEXT: add a3, a3, a0 -; RV32ZICOND-NEXT: xor a2, a3, a1 -; RV32ZICOND-NEXT: sltu a1, a3, a1 -; RV32ZICOND-NEXT: czero.eqz a1, a1, a2 -; RV32ZICOND-NEXT: czero.nez a0, a0, a2 -; RV32ZICOND-NEXT: or a0, a0, a1 -; RV32ZICOND-NEXT: beqz a0, .LBB65_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow +; RV32ZICOND: # %bb.0: # %overflow.entry +; RV32ZICOND-NEXT: beqz a1, .LBB65_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs +; RV32ZICOND-NEXT: srli a1, a1, 31 +; RV32ZICOND-NEXT: .LBB65_2: # %overflow.res +; RV32ZICOND-NEXT: andi a1, a1, 1 +; RV32ZICOND-NEXT: beqz a1, .LBB65_4 +; RV32ZICOND-NEXT: # %bb.3: # %overflow ; RV32ZICOND-NEXT: li a0, 0 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB65_2: # %continue +; RV32ZICOND-NEXT: .LBB65_4: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret ; diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index 1e5ab7922de08..ff846adf7e138 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -5,93 +5,106 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: -; SPARC: ! %bb.0: ! %start +; SPARC: ! %bb.0: ! %overflow.entry ; SPARC-NEXT: save %sp, -96, %sp -; SPARC-NEXT: ld [%fp+96], %l2 -; SPARC-NEXT: mov %i3, %g2 -; SPARC-NEXT: mov %i2, %g3 -; SPARC-NEXT: umul %i1, %l2, %l0 -; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: ld [%fp+92], %l1 -; SPARC-NEXT: umul %i0, %l2, %i3 -; SPARC-NEXT: rd %y, %g4 -; SPARC-NEXT: addcc %i3, %i2, %i2 -; SPARC-NEXT: addxcc %g4, 0, %i3 -; SPARC-NEXT: umul %i1, %l1, %g4 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %g4, %i2, %l4 -; SPARC-NEXT: addxcc %l3, 0, %i2 -; SPARC-NEXT: addcc %i3, %i2, %i2 -; SPARC-NEXT: addxcc %g0, 0, %i3 -; SPARC-NEXT: umul %i0, %l1, %g4 +; SPARC-NEXT: ld [%fp+96], %g3 +; SPARC-NEXT: ld [%fp+92], %l0 +; SPARC-NEXT: sra %i2, 31, %g2 +; SPARC-NEXT: xor %i0, %g2, %g4 +; SPARC-NEXT: xor %i1, %g2, %g2 +; SPARC-NEXT: or %g2, %g4, %g2 +; SPARC-NEXT: cmp %g2, 0 +; SPARC-NEXT: sra %l0, 31, %g2 +; SPARC-NEXT: xor %i4, %g2, %g4 +; SPARC-NEXT: xor %i5, %g2, %g2 +; SPARC-NEXT: be .LBB0_4 +; SPARC-NEXT: or %g2, %g4, %g2 +; SPARC-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC-NEXT: cmp %g2, 0 +; SPARC-NEXT: be .LBB0_15 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.2: ! 
%overflow +; SPARC-NEXT: umul %i1, %g3, %l1 +; SPARC-NEXT: rd %y, %g2 +; SPARC-NEXT: umul %i0, %g3, %g4 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: addcc %g4, %g2, %g2 +; SPARC-NEXT: addxcc %l2, 0, %g4 +; SPARC-NEXT: umul %i1, %l0, %l2 ; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %g4, %i2, %i2 +; SPARC-NEXT: addcc %l2, %g2, %l2 +; SPARC-NEXT: addxcc %l3, 0, %g2 +; SPARC-NEXT: addcc %g4, %g2, %g2 +; SPARC-NEXT: addxcc %g0, 0, %l3 +; SPARC-NEXT: umul %i0, %l0, %g4 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %g4, %g2, %g2 ; SPARC-NEXT: sra %i0, 31, %g4 -; SPARC-NEXT: smul %l1, %g4, %l5 -; SPARC-NEXT: umul %l2, %g4, %l6 +; SPARC-NEXT: smul %l0, %g4, %l5 +; SPARC-NEXT: umul %g3, %g4, %l6 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addxcc %l3, %i3, %l3 -; SPARC-NEXT: add %l7, %l6, %i3 -; SPARC-NEXT: add %i3, %l5, %l5 -; SPARC-NEXT: addcc %i2, %l6, %l6 -; SPARC-NEXT: umul %g2, %l2, %i3 -; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l3, %l5, %l3 -; SPARC-NEXT: umul %g3, %l2, %l2 -; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addcc %l2, %i2, %i2 -; SPARC-NEXT: addxcc %l5, 0, %l2 -; SPARC-NEXT: umul %g2, %l1, %l5 +; SPARC-NEXT: addxcc %l4, %l3, %l3 +; SPARC-NEXT: add %l7, %l6, %l4 +; SPARC-NEXT: add %l4, %l5, %l4 +; SPARC-NEXT: addcc %g2, %l6, %l5 +; SPARC-NEXT: umul %i3, %g3, %g2 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: addxcc %l3, %l4, %l3 +; SPARC-NEXT: umul %i2, %g3, %g3 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %g3, %l6, %g3 +; SPARC-NEXT: addxcc %l4, 0, %l4 +; SPARC-NEXT: umul %i3, %l0, %l6 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l5, %i2, %i2 -; SPARC-NEXT: addxcc %l7, 0, %l5 -; SPARC-NEXT: addcc %l2, %l5, %l2 -; SPARC-NEXT: addxcc %g0, 0, %l5 -; SPARC-NEXT: umul %g3, %l1, %l1 +; SPARC-NEXT: addcc %l6, %g3, %g3 +; SPARC-NEXT: addxcc %l7, 0, %l6 +; SPARC-NEXT: addcc %l4, %l6, %l4 +; SPARC-NEXT: addxcc %g0, 0, %l6 +; SPARC-NEXT: umul %i2, %l0, %l0 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l1, %l2, %l1 -; SPARC-NEXT: addxcc %l7, %l5, %l2 -; SPARC-NEXT: addcc %l0, %l1, %l0 -; SPARC-NEXT: addxcc %l4, %l2, %l1 -; SPARC-NEXT: addxcc %l6, 0, %l2 -; SPARC-NEXT: addxcc %l3, 0, %l3 -; SPARC-NEXT: umul %g2, %i5, %l4 +; SPARC-NEXT: addcc %l0, %l4, %l0 +; SPARC-NEXT: addxcc %l7, %l6, %l4 +; SPARC-NEXT: addcc %l1, %l0, %l0 +; SPARC-NEXT: addxcc %l2, %l4, %l1 +; SPARC-NEXT: addxcc %l5, 0, %l2 +; SPARC-NEXT: umul %i2, %i5, %l4 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: sra %l3, 31, %l6 -; SPARC-NEXT: umul %g3, %i5, %l7 -; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: addcc %l7, %l5, %l5 -; SPARC-NEXT: addxcc %o0, 0, %l7 -; SPARC-NEXT: umul %g2, %i4, %o0 +; SPARC-NEXT: addxcc %l3, 0, %l3 +; SPARC-NEXT: umul %i3, %i5, %l6 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: sra %l3, 31, %o0 +; SPARC-NEXT: addcc %l4, %l7, %l4 +; SPARC-NEXT: addxcc %l5, 0, %l5 +; SPARC-NEXT: umul %i3, %i4, %l7 ; SPARC-NEXT: rd %y, %o1 -; SPARC-NEXT: addcc %o0, %l5, %l5 -; SPARC-NEXT: addxcc %o1, 0, %o0 -; SPARC-NEXT: addcc %l7, %o0, %l7 -; SPARC-NEXT: addxcc %g0, 0, %o0 -; SPARC-NEXT: umul %g3, %i4, %o1 +; SPARC-NEXT: addcc %l7, %l4, %l4 +; SPARC-NEXT: addxcc %o1, 0, %l7 +; SPARC-NEXT: addcc %l5, %l7, %l5 +; SPARC-NEXT: addxcc %g0, 0, %l7 +; SPARC-NEXT: umul %i2, %i4, %o1 ; SPARC-NEXT: rd %y, %o2 -; SPARC-NEXT: addcc %o1, %l7, %l7 +; SPARC-NEXT: addcc %o1, %l5, %l5 ; SPARC-NEXT: sra %i4, 31, %o1 -; SPARC-NEXT: smul %o1, %g3, %g3 -; SPARC-NEXT: umul %o1, %g2, %g2 +; SPARC-NEXT: smul %o1, %i2, %i2 +; SPARC-NEXT: umul %o1, %i3, %i3 ; SPARC-NEXT: rd %y, %o3 -; SPARC-NEXT: addxcc %o2, %o0, %o0 -; SPARC-NEXT: add %o3, %g3, %g3 -; SPARC-NEXT: add 
%g3, %g2, %g3 -; SPARC-NEXT: addcc %l7, %g2, %l7 -; SPARC-NEXT: addxcc %o0, %g3, %o0 -; SPARC-NEXT: addcc %l4, %l0, %g2 -; SPARC-NEXT: addxcc %l5, %l1, %g3 -; SPARC-NEXT: addxcc %l7, 0, %l0 -; SPARC-NEXT: addxcc %o0, 0, %l1 +; SPARC-NEXT: addxcc %o2, %l7, %l7 +; SPARC-NEXT: add %o3, %i2, %i2 +; SPARC-NEXT: add %i2, %i3, %i2 +; SPARC-NEXT: addcc %l5, %i3, %i3 +; SPARC-NEXT: addxcc %l7, %i2, %l5 +; SPARC-NEXT: addcc %l6, %l0, %i2 +; SPARC-NEXT: addxcc %l4, %l1, %l0 +; SPARC-NEXT: addxcc %i3, 0, %i3 +; SPARC-NEXT: addxcc %l5, 0, %l1 ; SPARC-NEXT: sra %l1, 31, %l4 -; SPARC-NEXT: addcc %l2, %l0, %l0 +; SPARC-NEXT: addcc %l2, %i3, %i3 ; SPARC-NEXT: addxcc %l3, %l1, %l1 -; SPARC-NEXT: addxcc %l6, %l4, %l2 +; SPARC-NEXT: addxcc %o0, %l4, %l2 ; SPARC-NEXT: smul %i4, %g4, %l3 ; SPARC-NEXT: umul %i5, %g4, %g4 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l6, %l4, %l4 +; SPARC-NEXT: addxcc %o0, %l4, %l4 ; SPARC-NEXT: add %l5, %g4, %l5 ; SPARC-NEXT: smul %o1, %i0, %l6 ; SPARC-NEXT: umul %o1, %i1, %l7 @@ -113,150 +126,1050 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-NEXT: addxcc %l7, 0, %i5 ; SPARC-NEXT: addcc %l5, %i5, %i5 ; SPARC-NEXT: addxcc %g0, 0, %l5 -; SPARC-NEXT: umul %i0, %i4, %i0 -; SPARC-NEXT: rd %y, %i4 -; SPARC-NEXT: addcc %i0, %i5, %i0 -; SPARC-NEXT: addxcc %i4, %l5, %i4 -; SPARC-NEXT: addcc %i0, %g4, %i0 -; SPARC-NEXT: addxcc %i4, %l3, %i4 -; SPARC-NEXT: addcc %l6, %l0, %i5 +; SPARC-NEXT: umul %i0, %i4, %i4 +; SPARC-NEXT: mov %l0, %i0 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: addcc %i4, %i5, %i4 +; SPARC-NEXT: addxcc %l0, %l5, %i5 +; SPARC-NEXT: addcc %i4, %g4, %i4 +; SPARC-NEXT: addxcc %i5, %l3, %i5 +; SPARC-NEXT: addcc %l6, %i3, %i3 ; SPARC-NEXT: addxcc %i1, %l1, %i1 -; SPARC-NEXT: addxcc %i0, %l2, %i0 -; SPARC-NEXT: addxcc %i4, %l4, %i4 -; SPARC-NEXT: sra %g3, 31, %g4 -; SPARC-NEXT: xor %i4, %g4, %i4 +; SPARC-NEXT: addxcc %i4, %l2, %i4 +; SPARC-NEXT: addxcc %i5, %l4, %i5 +; SPARC-NEXT: sra %i0, 31, %g4 +; SPARC-NEXT: xor %i5, %g4, %i5 ; SPARC-NEXT: xor %i1, %g4, %i1 -; SPARC-NEXT: or %i1, %i4, %i1 -; SPARC-NEXT: xor %i0, %g4, %i0 -; SPARC-NEXT: xor %i5, %g4, %i4 -; SPARC-NEXT: or %i4, %i0, %i0 -; SPARC-NEXT: or %i0, %i1, %i0 +; SPARC-NEXT: or %i1, %i5, %i1 +; SPARC-NEXT: xor %i4, %g4, %i4 +; SPARC-NEXT: xor %i3, %g4, %i3 +; SPARC-NEXT: or %i3, %i4, %i3 +; SPARC-NEXT: or %i3, %i1, %i1 +; SPARC-NEXT: cmp %i1, 0 +; SPARC-NEXT: bne .LBB0_110 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.3: ! %overflow +; SPARC-NEXT: ba .LBB0_111 +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: .LBB0_4: ! %overflow.no.lhs +; SPARC-NEXT: cmp %g2, 0 +; SPARC-NEXT: be .LBB0_25 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.5: ! %overflow.no.lhs.only +; SPARC-NEXT: mov 1, %g4 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_7 +; SPARC-NEXT: mov %g4, %g2 +; SPARC-NEXT: ! %bb.6: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %g2 +; SPARC-NEXT: .LBB0_7: ! %overflow.no.lhs.only +; SPARC-NEXT: subcc %g0, %i3, %l4 +; SPARC-NEXT: subxcc %g0, %i2, %l3 +; SPARC-NEXT: subxcc %g0, %i1, %l1 +; SPARC-NEXT: subxcc %g0, %i0, %l2 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_26 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.8: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i3, %l4 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_27 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_9: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_28 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_10: ! 
%overflow.no.lhs.only +; SPARC-NEXT: mov %i0, %l2 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_29 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_11: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_30 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_12: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i1, %l1 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_31 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_13: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_32 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_14: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i2, %l3 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_33 +; SPARC-NEXT: nop +; SPARC-NEXT: ba .LBB0_34 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_15: ! %overflow.no.rhs.only +; SPARC-NEXT: mov 1, %g4 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_17 +; SPARC-NEXT: mov %g4, %g2 +; SPARC-NEXT: ! %bb.16: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %g2 +; SPARC-NEXT: .LBB0_17: ! %overflow.no.rhs.only +; SPARC-NEXT: subcc %g0, %g3, %l4 +; SPARC-NEXT: subxcc %g0, %l0, %l3 +; SPARC-NEXT: subxcc %g0, %i5, %l1 +; SPARC-NEXT: subxcc %g0, %i4, %l2 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_44 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.18: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g3, %l4 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_45 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_19: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_46 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_20: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i4, %l2 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_47 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_21: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_48 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_22: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i5, %l1 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_49 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_23: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_50 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_24: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %l0, %l3 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_51 +; SPARC-NEXT: nop +; SPARC-NEXT: ba .LBB0_52 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_25: ! %overflow.no +; SPARC-NEXT: smul %g3, %i0, %g2 +; SPARC-NEXT: umul %g3, %i1, %i0 +; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: add %l1, %g2, %g2 +; SPARC-NEXT: smul %l0, %i1, %i1 +; SPARC-NEXT: smul %i5, %i2, %l1 +; SPARC-NEXT: umul %i5, %i3, %i5 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: add %g2, %i1, %i1 +; SPARC-NEXT: add %l2, %l1, %g2 +; SPARC-NEXT: smul %i4, %i3, %i4 +; SPARC-NEXT: add %g2, %i4, %i4 +; SPARC-NEXT: addcc %i5, %i0, %i0 +; SPARC-NEXT: umul %i3, %g3, %g2 +; SPARC-NEXT: rd %y, %i5 +; SPARC-NEXT: addxcc %i4, %i1, %i4 +; SPARC-NEXT: umul %i2, %g3, %i1 +; SPARC-NEXT: rd %y, %g3 +; SPARC-NEXT: addcc %i1, %i5, %i1 +; SPARC-NEXT: addxcc %g3, 0, %i5 +; SPARC-NEXT: umul %i3, %l0, %i3 +; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: addcc %i3, %i1, %g3 +; SPARC-NEXT: addxcc %l1, 0, %i1 +; SPARC-NEXT: addcc %i5, %i1, %i1 +; SPARC-NEXT: addxcc %g0, 0, %i3 +; SPARC-NEXT: umul %i2, %l0, %i2 +; SPARC-NEXT: rd %y, %i5 +; SPARC-NEXT: addcc %i2, %i1, %i1 +; SPARC-NEXT: addxcc %i5, %i3, %i2 +; SPARC-NEXT: addcc %i1, %i0, %i1 +; SPARC-NEXT: ba .LBB0_112 +; SPARC-NEXT: addxcc %i2, %i4, %i0 +; SPARC-NEXT: .LBB0_26: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_9 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_27: ! 
%overflow.no.lhs.only +; SPARC-NEXT: mov %i2, %l3 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_10 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_28: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_11 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_29: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i1, %l1 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_12 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_30: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_13 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_31: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i0, %l2 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_14 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_32: ! %overflow.no.lhs.only ; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bne .LBB0_2 +; SPARC-NEXT: bl .LBB0_34 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_33: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i3, %l4 +; SPARC-NEXT: .LBB0_34: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_36 +; SPARC-NEXT: mov %g4, %i0 +; SPARC-NEXT: ! %bb.35: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: .LBB0_36: ! %overflow.no.lhs.only +; SPARC-NEXT: subcc %g0, %g3, %l6 +; SPARC-NEXT: subxcc %g0, %l0, %l5 +; SPARC-NEXT: subxcc %g0, %i5, %i2 +; SPARC-NEXT: subxcc %g0, %i4, %i1 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_62 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.37: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g3, %l6 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_63 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_38: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_64 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_39: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i5, %i2 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_65 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_40: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_66 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_41: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i4, %i1 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_67 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_42: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_68 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_43: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %l0, %l5 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_69 +; SPARC-NEXT: nop +; SPARC-NEXT: ba .LBB0_70 ; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.1: ! %start -; SPARC-NEXT: ba .LBB0_3 +; SPARC-NEXT: .LBB0_44: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_19 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_45: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %l0, %l3 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_20 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_46: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_21 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_47: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i5, %l1 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_22 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_48: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_23 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_49: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i4, %l2 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_24 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_50: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_52 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_51: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g3, %l4 +; SPARC-NEXT: .LBB0_52: ! 
%overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_54 +; SPARC-NEXT: mov %g4, %i4 +; SPARC-NEXT: ! %bb.53: ! %overflow.no.rhs.only ; SPARC-NEXT: mov %g0, %i4 -; SPARC-NEXT: .LBB0_2: -; SPARC-NEXT: mov 1, %i4 -; SPARC-NEXT: .LBB0_3: ! %start -; SPARC-NEXT: mov %g3, %i0 +; SPARC-NEXT: .LBB0_54: ! %overflow.no.rhs.only +; SPARC-NEXT: subcc %g0, %i3, %l5 +; SPARC-NEXT: subxcc %g0, %i2, %l0 +; SPARC-NEXT: subxcc %g0, %i1, %g3 +; SPARC-NEXT: subxcc %g0, %i0, %i5 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_85 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.55: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i3, %l5 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_86 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_56: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_87 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_57: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i1, %g3 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_88 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_58: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_89 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_59: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i0, %i5 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_90 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_60: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_91 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_61: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i2, %l0 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_92 +; SPARC-NEXT: nop +; SPARC-NEXT: ba .LBB0_93 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_62: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_38 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_63: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %l0, %l5 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_39 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_64: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_40 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_65: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i4, %i1 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_41 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_66: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_42 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_67: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %i5, %i2 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bge .LBB0_43 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_68: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bl .LBB0_70 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_69: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g3, %l6 +; SPARC-NEXT: .LBB0_70: ! 
%overflow.no.lhs.only +; SPARC-NEXT: umul %l4, %l6, %i3 +; SPARC-NEXT: rd %y, %i4 +; SPARC-NEXT: umul %l3, %l6, %i5 +; SPARC-NEXT: rd %y, %g3 +; SPARC-NEXT: addcc %i5, %i4, %i4 +; SPARC-NEXT: addxcc %g3, 0, %i5 +; SPARC-NEXT: umul %l4, %l5, %g3 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: addcc %g3, %i4, %i4 +; SPARC-NEXT: addxcc %l0, 0, %g3 +; SPARC-NEXT: addcc %i5, %g3, %i5 +; SPARC-NEXT: addxcc %g0, 0, %g3 +; SPARC-NEXT: umul %l3, %l5, %l0 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l0, %i5, %i5 +; SPARC-NEXT: smul %l6, %l2, %l0 +; SPARC-NEXT: umul %l6, %l1, %l6 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: addxcc %l7, %g3, %l7 +; SPARC-NEXT: add %o0, %l0, %g3 +; SPARC-NEXT: smul %l5, %l1, %l0 +; SPARC-NEXT: add %g3, %l0, %l0 +; SPARC-NEXT: addcc %i5, %l6, %g3 +; SPARC-NEXT: umul %l4, %i2, %l5 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: addxcc %l7, %l0, %i5 +; SPARC-NEXT: umul %l3, %i2, %l0 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l0, %l6, %l0 +; SPARC-NEXT: addxcc %l7, 0, %l6 +; SPARC-NEXT: umul %l4, %i1, %l4 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l4, %l0, %l4 +; SPARC-NEXT: addxcc %l7, 0, %l0 +; SPARC-NEXT: addcc %l6, %l0, %l0 +; SPARC-NEXT: addxcc %g0, 0, %l6 +; SPARC-NEXT: umul %l3, %i1, %l3 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l3, %l0, %l0 +; SPARC-NEXT: smul %i2, %l2, %l2 +; SPARC-NEXT: umul %i2, %l1, %i2 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addxcc %l7, %l6, %l6 +; SPARC-NEXT: add %l3, %l2, %l2 +; SPARC-NEXT: smul %i1, %l1, %i1 +; SPARC-NEXT: add %l2, %i1, %i1 +; SPARC-NEXT: addcc %l0, %i2, %l0 +; SPARC-NEXT: addxcc %l6, %i1, %l1 +; SPARC-NEXT: addcc %g3, %l5, %i1 +; SPARC-NEXT: addxcc %i5, %l4, %i2 +; SPARC-NEXT: cmp %i2, %i5 +; SPARC-NEXT: bcs .LBB0_72 +; SPARC-NEXT: mov %g4, %l2 +; SPARC-NEXT: ! %bb.71: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %l2 +; SPARC-NEXT: .LBB0_72: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i1, %g3 +; SPARC-NEXT: bcs .LBB0_74 +; SPARC-NEXT: mov %g4, %g3 +; SPARC-NEXT: ! %bb.73: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %g3 +; SPARC-NEXT: .LBB0_74: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i2, %i5 +; SPARC-NEXT: be .LBB0_76 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.75: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %l2, %g3 +; SPARC-NEXT: .LBB0_76: ! %overflow.no.lhs.only +; SPARC-NEXT: addcc %l0, %g3, %i5 +; SPARC-NEXT: addxcc %l1, 0, %l0 +; SPARC-NEXT: xor %i0, %g2, %i0 +; SPARC-NEXT: sub %g0, %i0, %l1 +; SPARC-NEXT: xor %i4, %l1, %i4 +; SPARC-NEXT: xor %i3, %l1, %i3 +; SPARC-NEXT: addcc %i3, %i0, %g2 +; SPARC-NEXT: addxcc %i4, 0, %g3 +; SPARC-NEXT: cmp %g2, %i0 +; SPARC-NEXT: bcs .LBB0_78 +; SPARC-NEXT: mov %g4, %i3 +; SPARC-NEXT: ! %bb.77: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %i3 +; SPARC-NEXT: .LBB0_78: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %g3, 0 +; SPARC-NEXT: be .LBB0_80 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.79: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %i3 +; SPARC-NEXT: .LBB0_80: ! %overflow.no.lhs.only +; SPARC-NEXT: xor %i1, %l1, %i0 +; SPARC-NEXT: xor %i2, %l1, %i2 +; SPARC-NEXT: addcc %i0, %i3, %i1 +; SPARC-NEXT: addxcc %i2, 0, %i0 +; SPARC-NEXT: cmp %i1, %i3 +; SPARC-NEXT: bcs .LBB0_82 +; SPARC-NEXT: mov %g4, %i2 +; SPARC-NEXT: ! %bb.81: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %i2 +; SPARC-NEXT: .LBB0_82: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: be .LBB0_84 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.83: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %i2 +; SPARC-NEXT: .LBB0_84: ! 
%overflow.no.lhs.only +; SPARC-NEXT: xor %i5, %l1, %i3 +; SPARC-NEXT: xor %l0, %l1, %i4 +; SPARC-NEXT: addcc %i3, %i2, %i2 +; SPARC-NEXT: ba .LBB0_108 +; SPARC-NEXT: addxcc %i4, 0, %i3 +; SPARC-NEXT: .LBB0_85: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_56 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_86: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i2, %l0 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_57 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_87: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_58 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_88: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i0, %i5 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_59 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_89: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_60 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_90: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i1, %g3 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bge .LBB0_61 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_91: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bl .LBB0_93 +; SPARC-NEXT: nop +; SPARC-NEXT: .LBB0_92: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %i3, %l5 +; SPARC-NEXT: .LBB0_93: ! %overflow.no.rhs.only +; SPARC-NEXT: umul %l4, %l5, %i0 +; SPARC-NEXT: rd %y, %i1 +; SPARC-NEXT: umul %l3, %l5, %i2 +; SPARC-NEXT: rd %y, %i3 +; SPARC-NEXT: addcc %i2, %i1, %i1 +; SPARC-NEXT: addxcc %i3, 0, %i2 +; SPARC-NEXT: umul %l4, %l0, %i3 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: addcc %i3, %i1, %i1 +; SPARC-NEXT: addxcc %l6, 0, %i3 +; SPARC-NEXT: addcc %i2, %i3, %i2 +; SPARC-NEXT: addxcc %g0, 0, %i3 +; SPARC-NEXT: umul %l3, %l0, %l6 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l6, %i2, %i2 +; SPARC-NEXT: smul %l5, %l2, %l6 +; SPARC-NEXT: umul %l5, %l1, %l5 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: addxcc %l7, %i3, %l7 +; SPARC-NEXT: add %o0, %l6, %i3 +; SPARC-NEXT: smul %l0, %l1, %l0 +; SPARC-NEXT: add %i3, %l0, %l0 +; SPARC-NEXT: addcc %i2, %l5, %i3 +; SPARC-NEXT: umul %l4, %g3, %l5 +; SPARC-NEXT: rd %y, %l6 +; SPARC-NEXT: addxcc %l7, %l0, %i2 +; SPARC-NEXT: umul %l3, %g3, %l0 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l0, %l6, %l0 +; SPARC-NEXT: addxcc %l7, 0, %l6 +; SPARC-NEXT: umul %l4, %i5, %l4 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l4, %l0, %l0 +; SPARC-NEXT: addxcc %l7, 0, %l4 +; SPARC-NEXT: addcc %l6, %l4, %l4 +; SPARC-NEXT: addxcc %g0, 0, %l6 +; SPARC-NEXT: umul %l3, %i5, %l3 +; SPARC-NEXT: rd %y, %l7 +; SPARC-NEXT: addcc %l3, %l4, %l3 +; SPARC-NEXT: smul %g3, %l2, %l2 +; SPARC-NEXT: umul %g3, %l1, %g3 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addxcc %l7, %l6, %l6 +; SPARC-NEXT: add %l4, %l2, %l2 +; SPARC-NEXT: smul %i5, %l1, %i5 +; SPARC-NEXT: add %l2, %i5, %i5 +; SPARC-NEXT: addcc %l3, %g3, %g3 +; SPARC-NEXT: addxcc %l6, %i5, %l1 +; SPARC-NEXT: addcc %i3, %l5, %i5 +; SPARC-NEXT: addxcc %i2, %l0, %l0 +; SPARC-NEXT: cmp %l0, %i2 +; SPARC-NEXT: bcs .LBB0_95 +; SPARC-NEXT: mov %g4, %l2 +; SPARC-NEXT: ! %bb.94: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %l2 +; SPARC-NEXT: .LBB0_95: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i5, %i3 +; SPARC-NEXT: bcs .LBB0_97 +; SPARC-NEXT: mov %g4, %i3 +; SPARC-NEXT: ! %bb.96: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %i3 +; SPARC-NEXT: .LBB0_97: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %l0, %i2 +; SPARC-NEXT: be .LBB0_99 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.98: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %l2, %i3 +; SPARC-NEXT: .LBB0_99: ! 
%overflow.no.rhs.only +; SPARC-NEXT: addcc %g3, %i3, %i2 +; SPARC-NEXT: addxcc %l1, 0, %i3 +; SPARC-NEXT: xor %g2, %i4, %l1 +; SPARC-NEXT: sub %g0, %l1, %i4 +; SPARC-NEXT: xor %i1, %i4, %i1 +; SPARC-NEXT: xor %i0, %i4, %i0 +; SPARC-NEXT: addcc %i0, %l1, %g2 +; SPARC-NEXT: addxcc %i1, 0, %g3 +; SPARC-NEXT: cmp %g2, %l1 +; SPARC-NEXT: bcs .LBB0_101 +; SPARC-NEXT: mov %g4, %l1 +; SPARC-NEXT: ! %bb.100: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %l1 +; SPARC-NEXT: .LBB0_101: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %g3, 0 +; SPARC-NEXT: be .LBB0_103 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.102: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %l1 +; SPARC-NEXT: .LBB0_103: ! %overflow.no.rhs.only +; SPARC-NEXT: xor %i5, %i4, %i0 +; SPARC-NEXT: xor %l0, %i4, %i5 +; SPARC-NEXT: addcc %i0, %l1, %i1 +; SPARC-NEXT: addxcc %i5, 0, %i0 +; SPARC-NEXT: cmp %i1, %l1 +; SPARC-NEXT: bcs .LBB0_105 +; SPARC-NEXT: mov %g4, %i5 +; SPARC-NEXT: ! %bb.104: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %i5 +; SPARC-NEXT: .LBB0_105: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: be .LBB0_107 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.106: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %i5 +; SPARC-NEXT: .LBB0_107: ! %overflow.no.rhs.only +; SPARC-NEXT: xor %i2, %i4, %i2 +; SPARC-NEXT: xor %i3, %i4, %i3 +; SPARC-NEXT: addcc %i2, %i5, %i2 +; SPARC-NEXT: addxcc %i3, 0, %i3 +; SPARC-NEXT: .LBB0_108: ! %overflow.no.rhs.only +; SPARC-NEXT: or %i2, %i3, %i2 +; SPARC-NEXT: cmp %i2, 0 +; SPARC-NEXT: bne .LBB0_112 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.109: ! %overflow.no.rhs.only +; SPARC-NEXT: ba .LBB0_112 +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: .LBB0_110: +; SPARC-NEXT: mov 1, %g4 +; SPARC-NEXT: .LBB0_111: ! %overflow +; SPARC-NEXT: mov %i2, %i1 +; SPARC-NEXT: .LBB0_112: ! %overflow.res +; SPARC-NEXT: and %g4, 1, %i4 +; SPARC-NEXT: mov %g3, %i2 ; SPARC-NEXT: ret -; SPARC-NEXT: restore %g0, %g2, %o1 +; SPARC-NEXT: restore %g0, %g2, %o3 ; ; SPARC64-LABEL: muloti_test: ; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch -; SPARC64-NEXT: ! %bb.0: ! %start +; SPARC64-NEXT: ! %bb.0: ! %overflow.entry ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: mov %i3, %i4 -; SPARC64-NEXT: mov %i1, %i5 -; SPARC64-NEXT: mov %i0, %l2 -; SPARC64-NEXT: srax %i0, 63, %i3 -; SPARC64-NEXT: mov %i3, %o0 +; SPARC64-NEXT: mov %i1, %i4 +; SPARC64-NEXT: srax %i1, 63, %i1 +; SPARC64-NEXT: cmp %i0, %i1 +; SPARC64-NEXT: be %xcc, .LBB0_3 +; SPARC64-NEXT: srax %i3, 63, %i1 +; SPARC64-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC64-NEXT: cmp %i2, %i1 +; SPARC64-NEXT: be %xcc, .LBB0_5 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.2: ! 
%overflow +; SPARC64-NEXT: srax %i0, 63, %i5 +; SPARC64-NEXT: mov %i5, %o0 ; SPARC64-NEXT: mov %i0, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 +; SPARC64-NEXT: mov %i3, %o3 ; SPARC64-NEXT: mov %o0, %l0 ; SPARC64-NEXT: mov %o1, %l1 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i1, %o1 +; SPARC64-NEXT: mov %i4, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 +; SPARC64-NEXT: mov %i3, %o3 ; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %g0, %i0 -; SPARC64-NEXT: add %l1, %o0, %l3 -; SPARC64-NEXT: cmp %l3, %l1 -; SPARC64-NEXT: movcs %xcc, 1, %i0 -; SPARC64-NEXT: srl %i0, 0, %i0 -; SPARC64-NEXT: add %l0, %i0, %l0 +; SPARC64-NEXT: mov %g0, %i3 +; SPARC64-NEXT: add %l1, %o0, %l2 +; SPARC64-NEXT: cmp %l2, %l1 +; SPARC64-NEXT: movcs %xcc, 1, %i3 +; SPARC64-NEXT: srl %i3, 0, %i3 +; SPARC64-NEXT: add %l0, %i3, %l0 ; SPARC64-NEXT: srax %l0, 63, %l1 -; SPARC64-NEXT: srax %i2, 63, %i4 +; SPARC64-NEXT: srax %i2, 63, %i3 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i5, %o1 -; SPARC64-NEXT: mov %i4, %o2 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %i3, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: mov %g0, %i5 +; SPARC64-NEXT: mov %g0, %i4 ; SPARC64-NEXT: mov %g0, %g2 -; SPARC64-NEXT: add %o1, %l3, %i0 -; SPARC64-NEXT: cmp %i0, %o1 -; SPARC64-NEXT: movcs %xcc, 1, %i5 -; SPARC64-NEXT: srl %i5, 0, %i5 -; SPARC64-NEXT: add %o0, %i5, %i5 -; SPARC64-NEXT: srax %i5, 63, %g3 -; SPARC64-NEXT: add %l1, %g3, %g3 -; SPARC64-NEXT: add %l0, %i5, %i5 -; SPARC64-NEXT: cmp %i5, %l0 +; SPARC64-NEXT: add %o1, %l2, %g3 +; SPARC64-NEXT: cmp %g3, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %i4 +; SPARC64-NEXT: srl %i4, 0, %i4 +; SPARC64-NEXT: add %o0, %i4, %i4 +; SPARC64-NEXT: srax %i4, 63, %g4 +; SPARC64-NEXT: add %l1, %g4, %g4 +; SPARC64-NEXT: add %l0, %i4, %i4 +; SPARC64-NEXT: cmp %i4, %l0 ; SPARC64-NEXT: movcs %xcc, 1, %g2 ; SPARC64-NEXT: srl %g2, 0, %g2 -; SPARC64-NEXT: add %g3, %g2, %l0 -; SPARC64-NEXT: mov %i3, %o0 -; SPARC64-NEXT: mov %l2, %o1 -; SPARC64-NEXT: mov %i4, %o2 +; SPARC64-NEXT: add %g4, %g2, %l0 +; SPARC64-NEXT: mov %i5, %o0 +; SPARC64-NEXT: mov %i0, %o1 +; SPARC64-NEXT: mov %g3, %i0 +; SPARC64-NEXT: mov %i3, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i2, %o3 +; SPARC64-NEXT: mov %g0, %i3 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: add %o0, %l0, %i5 +; SPARC64-NEXT: add %o1, %i4, %i4 +; SPARC64-NEXT: cmp %i4, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %i3 +; SPARC64-NEXT: srl %i3, 0, %i3 +; SPARC64-NEXT: add %i5, %i3, %i3 +; SPARC64-NEXT: srax %i0, 63, %i5 +; SPARC64-NEXT: xor %i3, %i5, %i3 +; SPARC64-NEXT: xor %i4, %i5, %i4 +; SPARC64-NEXT: ba .LBB0_7 +; SPARC64-NEXT: or %i4, %i3, %i3 +; SPARC64-NEXT: .LBB0_3: ! %overflow.no.lhs +; SPARC64-NEXT: cmp %i2, %i1 +; SPARC64-NEXT: be %xcc, .LBB0_8 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.4: ! 
%overflow.no.lhs.only +; SPARC64-NEXT: mov %g0, %i5 +; SPARC64-NEXT: mov %g0, %i1 +; SPARC64-NEXT: mov %g0, %l0 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: movrnz %i4, 1, %i1 +; SPARC64-NEXT: srl %i1, 0, %i1 +; SPARC64-NEXT: add %i0, %i1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i1 +; SPARC64-NEXT: mov %i0, %g3 +; SPARC64-NEXT: movrlz %i0, %i1, %g3 +; SPARC64-NEXT: sub %g0, %i4, %i1 +; SPARC64-NEXT: mov %i4, %g4 +; SPARC64-NEXT: movrlz %i0, %i1, %g4 +; SPARC64-NEXT: movrlz %i0, 1, %i5 +; SPARC64-NEXT: movrlz %i0, %g4, %i4 +; SPARC64-NEXT: movrlz %i0, %g3, %i0 +; SPARC64-NEXT: movrlz %i2, 1, %l0 +; SPARC64-NEXT: sub %g0, %i3, %i1 +; SPARC64-NEXT: mov %i3, %g3 +; SPARC64-NEXT: movrlz %i2, %i1, %g3 +; SPARC64-NEXT: movrnz %i3, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %i1 +; SPARC64-NEXT: add %i2, %i1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i1 +; SPARC64-NEXT: mov %i2, %g2 +; SPARC64-NEXT: movrlz %i2, %i1, %g2 +; SPARC64-NEXT: movrlz %i2, %g3, %i3 +; SPARC64-NEXT: movrlz %i2, %g2, %i2 +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %o0, %i1 +; SPARC64-NEXT: mov %o1, %i3 +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 +; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: mov %g0, %i4 +; SPARC64-NEXT: mov %g0, %g2 ; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: add %i1, %o1, %g3 +; SPARC64-NEXT: cmp %g3, %i1 +; SPARC64-NEXT: movcs %xcc, 1, %i0 +; SPARC64-NEXT: srl %i0, 0, %i0 +; SPARC64-NEXT: add %o0, %i0, %g4 +; SPARC64-NEXT: xor %l0, %i5, %i0 +; SPARC64-NEXT: and %i0, 1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i5 +; SPARC64-NEXT: srl %i0, 0, %i0 +; SPARC64-NEXT: xor %i3, %i5, %i1 +; SPARC64-NEXT: add %i1, %i0, %i1 +; SPARC64-NEXT: cmp %i1, %i0 +; SPARC64-NEXT: movcs %xcc, 1, %i4 +; SPARC64-NEXT: ba .LBB0_6 +; SPARC64-NEXT: srl %i4, 0, %i3 +; SPARC64-NEXT: .LBB0_5: ! 
%overflow.no.rhs.only +; SPARC64-NEXT: mov %g0, %i5 +; SPARC64-NEXT: mov %g0, %i1 +; SPARC64-NEXT: mov %g0, %l0 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: movrnz %i3, 1, %i1 +; SPARC64-NEXT: srl %i1, 0, %i1 +; SPARC64-NEXT: add %i2, %i1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i1 +; SPARC64-NEXT: mov %i2, %g3 +; SPARC64-NEXT: movrlz %i2, %i1, %g3 +; SPARC64-NEXT: sub %g0, %i3, %i1 +; SPARC64-NEXT: mov %i3, %g4 +; SPARC64-NEXT: movrlz %i2, %i1, %g4 +; SPARC64-NEXT: movrlz %i2, 1, %i5 +; SPARC64-NEXT: movrlz %i2, %g4, %i3 +; SPARC64-NEXT: movrlz %i2, %g3, %i2 +; SPARC64-NEXT: movrlz %i0, 1, %l0 +; SPARC64-NEXT: sub %g0, %i4, %i1 +; SPARC64-NEXT: mov %i4, %g3 +; SPARC64-NEXT: movrlz %i0, %i1, %g3 +; SPARC64-NEXT: movrnz %i4, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %i1 +; SPARC64-NEXT: add %i0, %i1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i1 +; SPARC64-NEXT: mov %i0, %g2 +; SPARC64-NEXT: movrlz %i0, %i1, %g2 +; SPARC64-NEXT: movrlz %i0, %g3, %i4 +; SPARC64-NEXT: movrlz %i0, %g2, %i0 +; SPARC64-NEXT: mov %i2, %o0 +; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i4, %o3 +; SPARC64-NEXT: mov %o0, %i1 +; SPARC64-NEXT: mov %o1, %i4 +; SPARC64-NEXT: mov %i2, %o0 +; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i0, %o3 +; SPARC64-NEXT: mov %g0, %i0 ; SPARC64-NEXT: mov %g0, %i3 -; SPARC64-NEXT: add %o0, %l0, %i4 -; SPARC64-NEXT: add %o1, %i5, %i5 -; SPARC64-NEXT: cmp %i5, %o1 -; SPARC64-NEXT: movcs %xcc, 1, %i2 -; SPARC64-NEXT: srl %i2, 0, %i2 -; SPARC64-NEXT: add %i4, %i2, %i2 -; SPARC64-NEXT: srax %i0, 63, %i4 -; SPARC64-NEXT: xor %i2, %i4, %i2 -; SPARC64-NEXT: xor %i5, %i4, %i4 -; SPARC64-NEXT: or %i4, %i2, %i2 -; SPARC64-NEXT: movrnz %i2, 1, %i3 -; SPARC64-NEXT: srl %i3, 0, %i2 +; SPARC64-NEXT: mov %g0, %g2 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: add %i1, %o1, %g3 +; SPARC64-NEXT: cmp %g3, %i1 +; SPARC64-NEXT: movcs %xcc, 1, %i0 +; SPARC64-NEXT: srl %i0, 0, %i0 +; SPARC64-NEXT: add %o0, %i0, %g4 +; SPARC64-NEXT: xor %i5, %l0, %i0 +; SPARC64-NEXT: and %i0, 1, %i1 +; SPARC64-NEXT: sub %g0, %i1, %i5 +; SPARC64-NEXT: srl %i0, 0, %i0 +; SPARC64-NEXT: xor %i4, %i5, %i1 +; SPARC64-NEXT: add %i1, %i0, %i1 +; SPARC64-NEXT: cmp %i1, %i0 +; SPARC64-NEXT: movcs %xcc, 1, %i3 +; SPARC64-NEXT: srl %i3, 0, %i3 +; SPARC64-NEXT: .LBB0_6: ! %overflow.res +; SPARC64-NEXT: xor %g3, %i5, %i0 +; SPARC64-NEXT: add %i0, %i3, %i0 +; SPARC64-NEXT: cmp %i0, %i3 +; SPARC64-NEXT: movcs %xcc, 1, %g2 +; SPARC64-NEXT: srl %g2, 0, %i3 +; SPARC64-NEXT: xor %g4, %i5, %i4 +; SPARC64-NEXT: add %i4, %i3, %i3 +; SPARC64-NEXT: .LBB0_7: ! %overflow.res +; SPARC64-NEXT: ba .LBB0_9 +; SPARC64-NEXT: movrnz %i3, 1, %i2 +; SPARC64-NEXT: .LBB0_8: ! %overflow.no +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %i2, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %o0, %i0 +; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: .LBB0_9: ! %overflow.res +; SPARC64-NEXT: and %i2, 1, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore ; ; SPARC64-VIS3-LABEL: muloti_test: ; SPARC64-VIS3: .register %g2, #scratch ; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! %start +; SPARC64-VIS3-NEXT: ! %bb.0: ! 
%overflow.entry ; SPARC64-VIS3-NEXT: save %sp, -128, %sp -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i4 -; SPARC64-VIS3-NEXT: srax %i0, 63, %g2 -; SPARC64-VIS3-NEXT: mulx %g2, %i3, %g3 -; SPARC64-VIS3-NEXT: add %i4, %g3, %i4 +; SPARC64-VIS3-NEXT: srax %i1, 63, %i4 +; SPARC64-VIS3-NEXT: cmp %i0, %i4 +; SPARC64-VIS3-NEXT: be %xcc, .LBB0_3 +; SPARC64-VIS3-NEXT: srax %i3, 63, %i4 +; SPARC64-VIS3-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC64-VIS3-NEXT: cmp %i2, %i4 +; SPARC64-VIS3-NEXT: be %xcc, .LBB0_5 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.2: ! %overflow +; SPARC64-VIS3-NEXT: mov %g0, %i4 +; SPARC64-VIS3-NEXT: srax %i0, 63, %i5 +; SPARC64-VIS3-NEXT: mulx %i5, %i3, %g2 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %g3 +; SPARC64-VIS3-NEXT: add %g3, %g2, %g2 ; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 ; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 ; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 -; SPARC64-VIS3-NEXT: addxccc %i4, %g0, %g4 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 -; SPARC64-VIS3-NEXT: srax %i2, 63, %g5 -; SPARC64-VIS3-NEXT: mulx %i1, %g5, %l0 -; SPARC64-VIS3-NEXT: add %i4, %l0, %l0 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i4 -; SPARC64-VIS3-NEXT: addcc %i4, %g3, %i4 -; SPARC64-VIS3-NEXT: addxccc %l0, %g0, %g3 -; SPARC64-VIS3-NEXT: srax %g3, 63, %l0 -; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 -; SPARC64-VIS3-NEXT: srax %g4, 63, %g4 -; SPARC64-VIS3-NEXT: addxccc %g4, %l0, %g4 -; SPARC64-VIS3-NEXT: and %g5, %i0, %g5 -; SPARC64-VIS3-NEXT: and %g2, %i2, %g2 -; SPARC64-VIS3-NEXT: add %g2, %g5, %g2 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g5 -; SPARC64-VIS3-NEXT: sub %g5, %g2, %g2 -; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: addcc %i0, %g3, %i0 -; SPARC64-VIS3-NEXT: addxccc %g2, %g4, %i2 -; SPARC64-VIS3-NEXT: srax %i4, 63, %g2 +; SPARC64-VIS3-NEXT: addxccc %g2, %g0, %g2 +; SPARC64-VIS3-NEXT: srax %i2, 63, %g4 +; SPARC64-VIS3-NEXT: mulx %i1, %g4, %g5 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %l0 +; SPARC64-VIS3-NEXT: add %l0, %g5, %g5 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %l0 +; SPARC64-VIS3-NEXT: addcc %l0, %g3, %g3 +; SPARC64-VIS3-NEXT: addxccc %g5, %g0, %g5 +; SPARC64-VIS3-NEXT: srax %g5, 63, %l0 +; SPARC64-VIS3-NEXT: addcc %g2, %g5, %g5 +; SPARC64-VIS3-NEXT: srax %g2, 63, %g2 +; SPARC64-VIS3-NEXT: addxccc %g2, %l0, %g2 +; SPARC64-VIS3-NEXT: and %g4, %i0, %g4 +; SPARC64-VIS3-NEXT: and %i5, %i2, %i5 +; SPARC64-VIS3-NEXT: add %i5, %g4, %i5 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g4 +; SPARC64-VIS3-NEXT: sub %g4, %i5, %i5 +; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i2 +; SPARC64-VIS3-NEXT: mov %g3, %i0 +; SPARC64-VIS3-NEXT: addcc %i2, %g5, %i2 +; SPARC64-VIS3-NEXT: addxccc %i5, %g2, %i5 +; SPARC64-VIS3-NEXT: srax %g3, 63, %g2 +; SPARC64-VIS3-NEXT: xor %i5, %g2, %i5 ; SPARC64-VIS3-NEXT: xor %i2, %g2, %i2 -; SPARC64-VIS3-NEXT: xor %i0, %g2, %i0 -; SPARC64-VIS3-NEXT: or %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 +; SPARC64-VIS3-NEXT: or %i2, %i5, %i2 +; SPARC64-VIS3-NEXT: ba .LBB0_7 +; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 +; SPARC64-VIS3-NEXT: .LBB0_3: ! %overflow.no.lhs +; SPARC64-VIS3-NEXT: cmp %i2, %i4 +; SPARC64-VIS3-NEXT: be %xcc, .LBB0_6 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.4: ! 
%overflow.no.lhs.only +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mov %g0, %g3 +; SPARC64-VIS3-NEXT: mov %g0, %g2 +; SPARC64-VIS3-NEXT: mov %g0, %g4 +; SPARC64-VIS3-NEXT: mov %g0, %g5 +; SPARC64-VIS3-NEXT: mov %g0, %l0 +; SPARC64-VIS3-NEXT: mov %g0, %l1 +; SPARC64-VIS3-NEXT: mov %g0, %i4 +; SPARC64-VIS3-NEXT: sub %g0, %i1, %l2 +; SPARC64-VIS3-NEXT: mov %i1, %l3 +; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %l3 +; SPARC64-VIS3-NEXT: movrnz %i1, 1, %g3 +; SPARC64-VIS3-NEXT: srl %g3, 0, %g3 +; SPARC64-VIS3-NEXT: add %i0, %g3, %g3 +; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 +; SPARC64-VIS3-NEXT: mov %i0, %l2 +; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %l2 +; SPARC64-VIS3-NEXT: movrlz %i0, 1, %i5 +; SPARC64-VIS3-NEXT: movrlz %i0, %l3, %i1 +; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %i0 +; SPARC64-VIS3-NEXT: sub %g0, %i3, %g3 +; SPARC64-VIS3-NEXT: mov %i3, %l2 +; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %l2 +; SPARC64-VIS3-NEXT: movrnz %i3, 1, %g4 +; SPARC64-VIS3-NEXT: srl %g4, 0, %g3 +; SPARC64-VIS3-NEXT: add %i2, %g3, %g3 +; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 +; SPARC64-VIS3-NEXT: mov %i2, %g4 +; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %g4 +; SPARC64-VIS3-NEXT: movrlz %i2, 1, %g2 +; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %i3 +; SPARC64-VIS3-NEXT: movrlz %i2, %g4, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 +; SPARC64-VIS3-NEXT: add %g3, %g4, %g3 +; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %g4 +; SPARC64-VIS3-NEXT: add %g4, %i0, %i0 +; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i3 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i1 +; SPARC64-VIS3-NEXT: add %g3, %i1, %i2 +; SPARC64-VIS3-NEXT: cmp %i2, %g3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g5 +; SPARC64-VIS3-NEXT: srl %g5, 0, %i1 +; SPARC64-VIS3-NEXT: add %i0, %i1, %g3 +; SPARC64-VIS3-NEXT: xor %g2, %i5, %i0 +; SPARC64-VIS3-NEXT: and %i0, 1, %i1 +; SPARC64-VIS3-NEXT: sub %g0, %i1, %i5 +; SPARC64-VIS3-NEXT: srl %i0, 0, %i0 +; SPARC64-VIS3-NEXT: xor %i3, %i5, %i1 +; SPARC64-VIS3-NEXT: add %i1, %i0, %i1 +; SPARC64-VIS3-NEXT: cmp %i1, %i0 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l0 +; SPARC64-VIS3-NEXT: srl %l0, 0, %i3 +; SPARC64-VIS3-NEXT: xor %i2, %i5, %i0 +; SPARC64-VIS3-NEXT: add %i0, %i3, %i0 +; SPARC64-VIS3-NEXT: cmp %i0, %i3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l1 +; SPARC64-VIS3-NEXT: srl %l1, 0, %i2 +; SPARC64-VIS3-NEXT: xor %g3, %i5, %i3 +; SPARC64-VIS3-NEXT: add %i3, %i2, %i2 +; SPARC64-VIS3-NEXT: ba .LBB0_8 +; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 +; SPARC64-VIS3-NEXT: .LBB0_5: ! 
%overflow.no.rhs.only +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mov %g0, %g3 +; SPARC64-VIS3-NEXT: mov %g0, %g2 +; SPARC64-VIS3-NEXT: mov %g0, %g4 +; SPARC64-VIS3-NEXT: mov %g0, %g5 +; SPARC64-VIS3-NEXT: mov %g0, %l0 +; SPARC64-VIS3-NEXT: mov %g0, %l1 +; SPARC64-VIS3-NEXT: mov %g0, %i4 +; SPARC64-VIS3-NEXT: sub %g0, %i3, %l2 +; SPARC64-VIS3-NEXT: mov %i3, %l3 +; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %l3 +; SPARC64-VIS3-NEXT: movrnz %i3, 1, %g3 +; SPARC64-VIS3-NEXT: srl %g3, 0, %g3 +; SPARC64-VIS3-NEXT: add %i2, %g3, %g3 +; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 +; SPARC64-VIS3-NEXT: mov %i2, %l2 +; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %l2 +; SPARC64-VIS3-NEXT: movrlz %i2, 1, %i5 +; SPARC64-VIS3-NEXT: movrlz %i2, %l3, %i3 +; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %i2 +; SPARC64-VIS3-NEXT: sub %g0, %i1, %g3 +; SPARC64-VIS3-NEXT: mov %i1, %l2 +; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %l2 +; SPARC64-VIS3-NEXT: movrnz %i1, 1, %g4 +; SPARC64-VIS3-NEXT: srl %g4, 0, %g3 +; SPARC64-VIS3-NEXT: add %i0, %g3, %g3 +; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 +; SPARC64-VIS3-NEXT: mov %i0, %g4 +; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %g4 +; SPARC64-VIS3-NEXT: movrlz %i0, 1, %g2 +; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %i1 +; SPARC64-VIS3-NEXT: movrlz %i0, %g4, %i0 +; SPARC64-VIS3-NEXT: umulxhi %i3, %i1, %g3 +; SPARC64-VIS3-NEXT: mulx %i2, %i1, %g4 +; SPARC64-VIS3-NEXT: add %g3, %g4, %g3 +; SPARC64-VIS3-NEXT: mulx %i2, %i0, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i3, %i0, %g4 +; SPARC64-VIS3-NEXT: add %g4, %i2, %i2 +; SPARC64-VIS3-NEXT: mulx %i3, %i1, %i1 +; SPARC64-VIS3-NEXT: mulx %i3, %i0, %i0 +; SPARC64-VIS3-NEXT: add %g3, %i0, %i0 +; SPARC64-VIS3-NEXT: cmp %i0, %g3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g5 +; SPARC64-VIS3-NEXT: srl %g5, 0, %i3 +; SPARC64-VIS3-NEXT: add %i2, %i3, %i2 +; SPARC64-VIS3-NEXT: xor %i5, %g2, %i3 +; SPARC64-VIS3-NEXT: and %i3, 1, %i5 +; SPARC64-VIS3-NEXT: sub %g0, %i5, %i5 +; SPARC64-VIS3-NEXT: srl %i3, 0, %i3 +; SPARC64-VIS3-NEXT: xor %i1, %i5, %i1 +; SPARC64-VIS3-NEXT: add %i1, %i3, %i1 +; SPARC64-VIS3-NEXT: cmp %i1, %i3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l0 +; SPARC64-VIS3-NEXT: srl %l0, 0, %i3 +; SPARC64-VIS3-NEXT: xor %i0, %i5, %i0 +; SPARC64-VIS3-NEXT: add %i0, %i3, %i0 +; SPARC64-VIS3-NEXT: cmp %i0, %i3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l1 +; SPARC64-VIS3-NEXT: srl %l1, 0, %i3 +; SPARC64-VIS3-NEXT: xor %i2, %i5, %i2 +; SPARC64-VIS3-NEXT: add %i2, %i3, %i2 +; SPARC64-VIS3-NEXT: ba .LBB0_8 +; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 +; SPARC64-VIS3-NEXT: .LBB0_6: ! %overflow.no +; SPARC64-VIS3-NEXT: mov %g0, %i4 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %i5 +; SPARC64-VIS3-NEXT: add %i5, %i2, %i2 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i0 +; SPARC64-VIS3-NEXT: add %i2, %i0, %i0 +; SPARC64-VIS3-NEXT: .LBB0_7: ! %overflow.res ; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: srl %i5, 0, %i2 +; SPARC64-VIS3-NEXT: .LBB0_8: ! 
%overflow.res +; SPARC64-VIS3-NEXT: and %i4, 1, %i2 ; SPARC64-VIS3-NEXT: ret -; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 +; SPARC64-VIS3-NEXT: restore start: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 6d197c88bfecd..4533523f97d74 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -5,207 +5,470 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: -; SPARC: ! %bb.0: ! %start +; SPARC: ! %bb.0: ! %overflow.entry ; SPARC-NEXT: save %sp, -96, %sp +; SPARC-NEXT: ld [%fp+96], %l1 +; SPARC-NEXT: ld [%fp+92], %g4 +; SPARC-NEXT: or %i1, %i0, %l0 +; SPARC-NEXT: cmp %l0, 0 ; SPARC-NEXT: mov %i3, %g2 -; SPARC-NEXT: mov %i2, %g4 -; SPARC-NEXT: umul %i2, %i5, %i2 +; SPARC-NEXT: be .LBB0_33 +; SPARC-NEXT: mov %i2, %g3 +; SPARC-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC-NEXT: or %i5, %i4, %l2 +; SPARC-NEXT: cmp %l2, 0 +; SPARC-NEXT: be .LBB0_40 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.2: ! %overflow +; SPARC-NEXT: umul %g3, %i5, %i2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: ld [%fp+92], %l4 -; SPARC-NEXT: umul %i4, %i3, %i3 -; SPARC-NEXT: rd %y, %o1 -; SPARC-NEXT: ld [%fp+96], %g3 -; SPARC-NEXT: umul %i5, %g2, %l3 +; SPARC-NEXT: umul %i4, %g2, %i3 +; SPARC-NEXT: rd %y, %o2 +; SPARC-NEXT: umul %i5, %g2, %l5 ; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: umul %l4, %i1, %l2 -; SPARC-NEXT: rd %y, %l1 +; SPARC-NEXT: umul %g4, %i1, %l4 +; SPARC-NEXT: rd %y, %l3 ; SPARC-NEXT: add %i3, %i2, %i2 -; SPARC-NEXT: umul %i0, %g3, %i3 +; SPARC-NEXT: umul %i0, %l1, %i3 ; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: add %o0, %i2, %o2 -; SPARC-NEXT: umul %i1, %g3, %i2 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: add %i3, %l2, %i3 -; SPARC-NEXT: add %l0, %i3, %l2 -; SPARC-NEXT: addcc %i2, %l3, %l3 -; SPARC-NEXT: umul %g2, %g3, %i3 +; SPARC-NEXT: add %o0, %i2, %o1 +; SPARC-NEXT: umul %i1, %l1, %i1 +; SPARC-NEXT: rd %y, %i5 +; SPARC-NEXT: add %i3, %l4, %i2 +; SPARC-NEXT: add %i5, %i2, %l4 +; SPARC-NEXT: addcc %i1, %l5, %i1 +; SPARC-NEXT: umul %g2, %l1, %i3 ; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l2, %o2, %o4 -; SPARC-NEXT: umul %g4, %g3, %g3 +; SPARC-NEXT: addxcc %l4, %o1, %o4 +; SPARC-NEXT: umul %g3, %l1, %l1 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addcc %g3, %i2, %i2 -; SPARC-NEXT: addxcc %l5, 0, %g3 -; SPARC-NEXT: umul %g2, %l4, %g2 +; SPARC-NEXT: addcc %l1, %i2, %i2 +; SPARC-NEXT: addxcc %l5, 0, %l1 +; SPARC-NEXT: umul %g2, %g4, %g2 ; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g2, %i2, %i2 ; SPARC-NEXT: addxcc %l5, 0, %g2 -; SPARC-NEXT: addcc %g3, %g2, %g2 -; SPARC-NEXT: addxcc %g0, 0, %g3 -; SPARC-NEXT: umul %g4, %l4, %l5 +; SPARC-NEXT: addcc %l1, %g2, %g2 +; SPARC-NEXT: addxcc %g0, 0, %l1 +; SPARC-NEXT: umul %g3, %g4, %l5 ; SPARC-NEXT: rd %y, %o3 ; SPARC-NEXT: addcc %l5, %g2, %l5 -; SPARC-NEXT: addxcc %o3, %g3, %o3 -; SPARC-NEXT: addcc %l5, %l3, %g2 -; SPARC-NEXT: addxcc %o3, %o4, %g3 -; SPARC-NEXT: mov 1, %l3 -; SPARC-NEXT: cmp %g3, %o3 -; SPARC-NEXT: bcs .LBB0_2 -; SPARC-NEXT: mov %l3, %o4 -; SPARC-NEXT: ! %bb.1: ! %start -; SPARC-NEXT: mov %g0, %o4 -; SPARC-NEXT: .LBB0_2: ! 
%start -; SPARC-NEXT: cmp %g2, %l5 +; SPARC-NEXT: addxcc %o3, %l1, %o3 +; SPARC-NEXT: addcc %l5, %i1, %i1 +; SPARC-NEXT: addxcc %o3, %o4, %g2 +; SPARC-NEXT: mov 1, %l1 +; SPARC-NEXT: cmp %g2, %o3 ; SPARC-NEXT: bcs .LBB0_4 -; SPARC-NEXT: mov %l3, %l5 -; SPARC-NEXT: ! %bb.3: ! %start +; SPARC-NEXT: mov %l1, %o4 +; SPARC-NEXT: ! %bb.3: ! %overflow +; SPARC-NEXT: mov %g0, %o4 +; SPARC-NEXT: .LBB0_4: ! %overflow +; SPARC-NEXT: cmp %i1, %l5 +; SPARC-NEXT: bcs .LBB0_6 +; SPARC-NEXT: mov %l1, %l5 +; SPARC-NEXT: ! %bb.5: ! %overflow ; SPARC-NEXT: mov %g0, %l5 -; SPARC-NEXT: .LBB0_4: ! %start -; SPARC-NEXT: cmp %g3, %o3 -; SPARC-NEXT: be .LBB0_6 +; SPARC-NEXT: .LBB0_6: ! %overflow +; SPARC-NEXT: cmp %g2, %o3 +; SPARC-NEXT: be .LBB0_8 ; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.5: ! %start +; SPARC-NEXT: ! %bb.7: ! %overflow ; SPARC-NEXT: mov %o4, %l5 -; SPARC-NEXT: .LBB0_6: ! %start -; SPARC-NEXT: cmp %g4, 0 -; SPARC-NEXT: bne .LBB0_8 -; SPARC-NEXT: mov %l3, %o3 -; SPARC-NEXT: ! %bb.7: ! %start -; SPARC-NEXT: mov %g0, %o3 -; SPARC-NEXT: .LBB0_8: ! %start +; SPARC-NEXT: .LBB0_8: ! %overflow ; SPARC-NEXT: cmp %i4, 0 ; SPARC-NEXT: bne .LBB0_10 -; SPARC-NEXT: mov %l3, %o4 -; SPARC-NEXT: ! %bb.9: ! %start -; SPARC-NEXT: mov %g0, %o4 -; SPARC-NEXT: .LBB0_10: ! %start -; SPARC-NEXT: cmp %o1, 0 +; SPARC-NEXT: mov %l1, %o3 +; SPARC-NEXT: ! %bb.9: ! %overflow +; SPARC-NEXT: mov %g0, %o3 +; SPARC-NEXT: .LBB0_10: ! %overflow +; SPARC-NEXT: cmp %g3, 0 ; SPARC-NEXT: bne .LBB0_12 -; SPARC-NEXT: mov %l3, %o1 -; SPARC-NEXT: ! %bb.11: ! %start -; SPARC-NEXT: mov %g0, %o1 -; SPARC-NEXT: .LBB0_12: ! %start -; SPARC-NEXT: cmp %l7, 0 +; SPARC-NEXT: mov %l1, %o4 +; SPARC-NEXT: ! %bb.11: ! %overflow +; SPARC-NEXT: mov %g0, %o4 +; SPARC-NEXT: .LBB0_12: ! %overflow +; SPARC-NEXT: cmp %o2, 0 ; SPARC-NEXT: bne .LBB0_14 -; SPARC-NEXT: mov %l3, %l7 -; SPARC-NEXT: ! %bb.13: ! %start -; SPARC-NEXT: mov %g0, %l7 -; SPARC-NEXT: .LBB0_14: ! %start -; SPARC-NEXT: cmp %o2, %o0 -; SPARC-NEXT: bcs .LBB0_16 -; SPARC-NEXT: mov %l3, %g4 -; SPARC-NEXT: ! %bb.15: ! %start -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: .LBB0_16: ! %start -; SPARC-NEXT: cmp %l4, 0 -; SPARC-NEXT: bne .LBB0_18 -; SPARC-NEXT: mov %l3, %l4 -; SPARC-NEXT: ! %bb.17: ! %start -; SPARC-NEXT: mov %g0, %l4 -; SPARC-NEXT: .LBB0_18: ! %start +; SPARC-NEXT: mov %l1, %o2 +; SPARC-NEXT: ! %bb.13: ! %overflow +; SPARC-NEXT: mov %g0, %o2 +; SPARC-NEXT: .LBB0_14: ! %overflow +; SPARC-NEXT: cmp %l7, 0 +; SPARC-NEXT: bne .LBB0_16 +; SPARC-NEXT: mov %l1, %g3 +; SPARC-NEXT: ! %bb.15: ! %overflow +; SPARC-NEXT: mov %g0, %g3 +; SPARC-NEXT: .LBB0_16: ! %overflow +; SPARC-NEXT: cmp %o1, %o0 +; SPARC-NEXT: bcs .LBB0_18 +; SPARC-NEXT: mov %l1, %i4 +; SPARC-NEXT: ! %bb.17: ! %overflow +; SPARC-NEXT: mov %g0, %i4 +; SPARC-NEXT: .LBB0_18: ! %overflow ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_20 -; SPARC-NEXT: mov %l3, %o0 -; SPARC-NEXT: ! %bb.19: ! %start -; SPARC-NEXT: mov %g0, %o0 -; SPARC-NEXT: .LBB0_20: ! %start -; SPARC-NEXT: cmp %l6, 0 +; SPARC-NEXT: mov %l1, %i0 +; SPARC-NEXT: ! %bb.19: ! %overflow +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: .LBB0_20: ! %overflow +; SPARC-NEXT: cmp %g4, 0 ; SPARC-NEXT: bne .LBB0_22 -; SPARC-NEXT: mov %l3, %l6 -; SPARC-NEXT: ! %bb.21: ! %start -; SPARC-NEXT: mov %g0, %l6 -; SPARC-NEXT: .LBB0_22: ! %start -; SPARC-NEXT: and %o4, %o3, %o2 -; SPARC-NEXT: cmp %l1, 0 -; SPARC-NEXT: and %o0, %l4, %l4 +; SPARC-NEXT: mov %l1, %l7 +; SPARC-NEXT: ! %bb.21: ! %overflow +; SPARC-NEXT: mov %g0, %l7 +; SPARC-NEXT: .LBB0_22: ! 
%overflow +; SPARC-NEXT: cmp %l6, 0 ; SPARC-NEXT: bne .LBB0_24 -; SPARC-NEXT: mov %l3, %l1 -; SPARC-NEXT: ! %bb.23: ! %start -; SPARC-NEXT: mov %g0, %l1 -; SPARC-NEXT: .LBB0_24: ! %start -; SPARC-NEXT: or %o2, %o1, %o0 -; SPARC-NEXT: cmp %l2, %l0 -; SPARC-NEXT: or %l4, %l6, %l4 -; SPARC-NEXT: bcs .LBB0_26 -; SPARC-NEXT: mov %l3, %l0 -; SPARC-NEXT: ! %bb.25: ! %start -; SPARC-NEXT: mov %g0, %l0 -; SPARC-NEXT: .LBB0_26: ! %start -; SPARC-NEXT: or %o0, %l7, %l2 -; SPARC-NEXT: or %i5, %i4, %i4 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: or %l4, %l1, %l1 -; SPARC-NEXT: bne .LBB0_28 -; SPARC-NEXT: mov %l3, %i4 -; SPARC-NEXT: ! %bb.27: ! %start -; SPARC-NEXT: mov %g0, %i4 -; SPARC-NEXT: .LBB0_28: ! %start -; SPARC-NEXT: or %l2, %g4, %i5 -; SPARC-NEXT: or %i1, %i0, %i0 -; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: mov %l1, %g4 +; SPARC-NEXT: ! %bb.23: ! %overflow +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: .LBB0_24: ! %overflow +; SPARC-NEXT: and %o3, %o4, %l6 +; SPARC-NEXT: cmp %l3, 0 +; SPARC-NEXT: and %i0, %l7, %l7 +; SPARC-NEXT: bne .LBB0_26 +; SPARC-NEXT: mov %l1, %i0 +; SPARC-NEXT: ! %bb.25: ! %overflow +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: .LBB0_26: ! %overflow +; SPARC-NEXT: or %l6, %o2, %l3 +; SPARC-NEXT: cmp %l4, %i5 +; SPARC-NEXT: or %l7, %g4, %g4 +; SPARC-NEXT: bcs .LBB0_28 +; SPARC-NEXT: mov %l1, %i5 +; SPARC-NEXT: ! %bb.27: ! %overflow +; SPARC-NEXT: mov %g0, %i5 +; SPARC-NEXT: .LBB0_28: ! %overflow +; SPARC-NEXT: or %l3, %g3, %g3 +; SPARC-NEXT: cmp %l2, 0 +; SPARC-NEXT: or %g4, %i0, %g4 ; SPARC-NEXT: bne .LBB0_30 -; SPARC-NEXT: or %l1, %l0, %i0 -; SPARC-NEXT: ! %bb.29: ! %start -; SPARC-NEXT: mov %g0, %l3 -; SPARC-NEXT: .LBB0_30: ! %start -; SPARC-NEXT: and %l3, %i4, %i1 -; SPARC-NEXT: or %i1, %i0, %i0 +; SPARC-NEXT: mov %l1, %i0 +; SPARC-NEXT: ! %bb.29: ! %overflow +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: .LBB0_30: ! %overflow +; SPARC-NEXT: or %g3, %i4, %i4 +; SPARC-NEXT: cmp %l0, 0 +; SPARC-NEXT: bne .LBB0_32 +; SPARC-NEXT: or %g4, %i5, %i5 +; SPARC-NEXT: ! %bb.31: ! %overflow +; SPARC-NEXT: mov %g0, %l1 +; SPARC-NEXT: .LBB0_32: ! %overflow +; SPARC-NEXT: and %l1, %i0, %i0 ; SPARC-NEXT: or %i0, %i5, %i0 +; SPARC-NEXT: or %i0, %i4, %i0 +; SPARC-NEXT: ba .LBB0_49 ; SPARC-NEXT: or %i0, %l5, %i0 +; SPARC-NEXT: .LBB0_33: ! %overflow.no.lhs +; SPARC-NEXT: or %i5, %i4, %i2 +; SPARC-NEXT: cmp %i2, 0 +; SPARC-NEXT: be .LBB0_48 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.34: ! 
%overflow.no.lhs.only +; SPARC-NEXT: umul %g3, %l1, %i2 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: umul %g2, %l1, %i3 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: addcc %i2, %l2, %i2 +; SPARC-NEXT: addxcc %l0, 0, %l0 +; SPARC-NEXT: umul %g2, %g4, %l2 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addcc %l2, %i2, %i2 +; SPARC-NEXT: addxcc %l3, 0, %l2 +; SPARC-NEXT: addcc %l0, %l2, %l0 +; SPARC-NEXT: addxcc %g0, 0, %l2 +; SPARC-NEXT: umul %g3, %g4, %l3 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %l3, %l0, %l0 +; SPARC-NEXT: smul %l1, %i0, %l3 +; SPARC-NEXT: umul %l1, %i1, %l1 +; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: addxcc %l4, %l2, %l2 +; SPARC-NEXT: add %l5, %l3, %l3 +; SPARC-NEXT: smul %g4, %i1, %g4 +; SPARC-NEXT: add %l3, %g4, %g4 +; SPARC-NEXT: addcc %l0, %l1, %l0 +; SPARC-NEXT: umul %g2, %i5, %l1 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addxcc %l2, %g4, %g4 +; SPARC-NEXT: umul %g3, %i5, %l2 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %l2, %l3, %l2 +; SPARC-NEXT: addxcc %l4, 0, %l3 +; SPARC-NEXT: umul %g2, %i4, %g2 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %g2, %l2, %g2 +; SPARC-NEXT: addxcc %l4, 0, %l2 +; SPARC-NEXT: addcc %l3, %l2, %l2 +; SPARC-NEXT: addxcc %g0, 0, %l3 +; SPARC-NEXT: umul %g3, %i4, %g3 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %g3, %l2, %g3 +; SPARC-NEXT: smul %i5, %i0, %i0 +; SPARC-NEXT: umul %i5, %i1, %i5 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: addxcc %l4, %l3, %l3 +; SPARC-NEXT: add %l2, %i0, %i0 +; SPARC-NEXT: smul %i4, %i1, %i1 +; SPARC-NEXT: add %i0, %i1, %i0 +; SPARC-NEXT: addcc %g3, %i5, %i4 +; SPARC-NEXT: addxcc %l3, %i0, %i5 +; SPARC-NEXT: addcc %l0, %l1, %i1 +; SPARC-NEXT: addxcc %g4, %g2, %g2 +; SPARC-NEXT: mov 1, %i0 +; SPARC-NEXT: cmp %g2, %g4 +; SPARC-NEXT: bcs .LBB0_36 +; SPARC-NEXT: mov %i0, %g3 +; SPARC-NEXT: ! %bb.35: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %g3 +; SPARC-NEXT: .LBB0_36: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %i1, %l0 +; SPARC-NEXT: bcs .LBB0_38 +; SPARC-NEXT: mov %i0, %l0 +; SPARC-NEXT: ! %bb.37: ! %overflow.no.lhs.only +; SPARC-NEXT: mov %g0, %l0 +; SPARC-NEXT: .LBB0_38: ! %overflow.no.lhs.only +; SPARC-NEXT: cmp %g2, %g4 +; SPARC-NEXT: be .LBB0_46 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.39: ! %overflow.no.lhs.only +; SPARC-NEXT: ba .LBB0_46 +; SPARC-NEXT: mov %g3, %l0 +; SPARC-NEXT: .LBB0_40: ! 
%overflow.no.rhs.only +; SPARC-NEXT: umul %g4, %g2, %i2 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: umul %l1, %g2, %i3 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: addcc %i2, %l2, %i2 +; SPARC-NEXT: addxcc %l0, 0, %l0 +; SPARC-NEXT: umul %l1, %g3, %l2 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addcc %l2, %i2, %i2 +; SPARC-NEXT: addxcc %l3, 0, %l2 +; SPARC-NEXT: addcc %l0, %l2, %l0 +; SPARC-NEXT: addxcc %g0, 0, %l2 +; SPARC-NEXT: umul %g4, %g3, %l3 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %l3, %l0, %l0 +; SPARC-NEXT: smul %g2, %i4, %l3 +; SPARC-NEXT: umul %g2, %i5, %g2 +; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: addxcc %l4, %l2, %l2 +; SPARC-NEXT: add %l5, %l3, %l3 +; SPARC-NEXT: smul %g3, %i5, %g3 +; SPARC-NEXT: add %l3, %g3, %g3 +; SPARC-NEXT: addcc %l0, %g2, %l0 +; SPARC-NEXT: umul %l1, %i1, %g2 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addxcc %l2, %g3, %g3 +; SPARC-NEXT: umul %g4, %i1, %l2 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %l2, %l3, %l2 +; SPARC-NEXT: addxcc %l4, 0, %l3 +; SPARC-NEXT: umul %l1, %i0, %l1 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %l1, %l2, %l1 +; SPARC-NEXT: addxcc %l4, 0, %l2 +; SPARC-NEXT: addcc %l3, %l2, %l2 +; SPARC-NEXT: addxcc %g0, 0, %l3 +; SPARC-NEXT: umul %g4, %i0, %g4 +; SPARC-NEXT: rd %y, %l4 +; SPARC-NEXT: addcc %g4, %l2, %g4 +; SPARC-NEXT: smul %i1, %i4, %i4 +; SPARC-NEXT: umul %i1, %i5, %i1 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: addxcc %l4, %l3, %l3 +; SPARC-NEXT: add %l2, %i4, %i4 +; SPARC-NEXT: smul %i0, %i5, %i0 +; SPARC-NEXT: add %i4, %i0, %i0 +; SPARC-NEXT: addcc %g4, %i1, %i4 +; SPARC-NEXT: addxcc %l3, %i0, %i5 +; SPARC-NEXT: addcc %l0, %g2, %i1 +; SPARC-NEXT: addxcc %g3, %l1, %g2 +; SPARC-NEXT: mov 1, %i0 +; SPARC-NEXT: cmp %g2, %g3 +; SPARC-NEXT: bcs .LBB0_42 +; SPARC-NEXT: mov %i0, %g4 +; SPARC-NEXT: ! %bb.41: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: .LBB0_42: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %i1, %l0 +; SPARC-NEXT: bcs .LBB0_44 +; SPARC-NEXT: mov %i0, %l0 +; SPARC-NEXT: ! %bb.43: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g0, %l0 +; SPARC-NEXT: .LBB0_44: ! %overflow.no.rhs.only +; SPARC-NEXT: cmp %g2, %g3 +; SPARC-NEXT: be .LBB0_46 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.45: ! %overflow.no.rhs.only +; SPARC-NEXT: mov %g4, %l0 +; SPARC-NEXT: .LBB0_46: ! %overflow.no.rhs.only +; SPARC-NEXT: addcc %i4, %l0, %i4 +; SPARC-NEXT: addxcc %i5, 0, %i5 +; SPARC-NEXT: or %i4, %i5, %i4 +; SPARC-NEXT: cmp %i4, 0 +; SPARC-NEXT: bne .LBB0_49 +; SPARC-NEXT: nop +; SPARC-NEXT: ! %bb.47: ! %overflow.no.rhs.only +; SPARC-NEXT: ba .LBB0_49 +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: .LBB0_48: ! 
%overflow.no +; SPARC-NEXT: smul %l1, %i0, %i3 +; SPARC-NEXT: umul %l1, %i1, %i2 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: mov %g0, %i0 +; SPARC-NEXT: add %l0, %i3, %i3 +; SPARC-NEXT: smul %g4, %i1, %i1 +; SPARC-NEXT: smul %i5, %g3, %l0 +; SPARC-NEXT: umul %i5, %g2, %i5 +; SPARC-NEXT: rd %y, %l2 +; SPARC-NEXT: add %i3, %i1, %i1 +; SPARC-NEXT: add %l2, %l0, %i3 +; SPARC-NEXT: smul %i4, %g2, %i4 +; SPARC-NEXT: add %i3, %i4, %i4 +; SPARC-NEXT: addcc %i5, %i2, %i5 +; SPARC-NEXT: umul %g2, %l1, %i3 +; SPARC-NEXT: rd %y, %i2 +; SPARC-NEXT: addxcc %i4, %i1, %i4 +; SPARC-NEXT: umul %g3, %l1, %i1 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: addcc %i1, %i2, %i1 +; SPARC-NEXT: addxcc %l0, 0, %l0 +; SPARC-NEXT: umul %g2, %g4, %i2 +; SPARC-NEXT: rd %y, %g2 +; SPARC-NEXT: addcc %i2, %i1, %i2 +; SPARC-NEXT: addxcc %g2, 0, %i1 +; SPARC-NEXT: addcc %l0, %i1, %i1 +; SPARC-NEXT: addxcc %g0, 0, %g2 +; SPARC-NEXT: umul %g3, %g4, %g3 +; SPARC-NEXT: rd %y, %g4 +; SPARC-NEXT: addcc %g3, %i1, %i1 +; SPARC-NEXT: addxcc %g4, %g2, %g2 +; SPARC-NEXT: addcc %i1, %i5, %i1 +; SPARC-NEXT: addxcc %g2, %i4, %g2 +; SPARC-NEXT: .LBB0_49: ! %overflow.res ; SPARC-NEXT: and %i0, 1, %i4 -; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret -; SPARC-NEXT: restore %g0, %g2, %o1 +; SPARC-NEXT: restore %g0, %g2, %o0 ; ; SPARC64-LABEL: muloti_test: ; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch -; SPARC64-NEXT: ! %bb.0: ! %start +; SPARC64-NEXT: ! %bb.0: ! %overflow.entry ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: mov %i0, %l1 +; SPARC64-NEXT: brz %i0, .LBB0_3 +; SPARC64-NEXT: mov %i1, %i4 +; SPARC64-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC64-NEXT: brz %i2, .LBB0_5 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.2: ! %overflow ; SPARC64-NEXT: mov %g0, %o0 ; SPARC64-NEXT: mov %i2, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i1, %o3 -; SPARC64-NEXT: mov %o0, %i4 -; SPARC64-NEXT: mov %o1, %i5 +; SPARC64-NEXT: mov %i4, %o3 +; SPARC64-NEXT: mov %o0, %i5 +; SPARC64-NEXT: mov %o1, %i1 ; SPARC64-NEXT: mov %g0, %o0 ; SPARC64-NEXT: mov %i0, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i3, %o3 ; SPARC64-NEXT: mov %o0, %l0 -; SPARC64-NEXT: add %o1, %i5, %i0 +; SPARC64-NEXT: add %o1, %i1, %l1 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i1, %o1 +; SPARC64-NEXT: mov %i4, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %g0, %i1 -; SPARC64-NEXT: mov %g0, %i3 -; SPARC64-NEXT: mov %g0, %i5 +; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %g0, %i4 ; SPARC64-NEXT: mov %g0, %g2 ; SPARC64-NEXT: mov %g0, %g3 -; SPARC64-NEXT: add %o0, %i0, %i0 -; SPARC64-NEXT: cmp %i0, %o0 -; SPARC64-NEXT: movrnz %l0, 1, %i3 -; SPARC64-NEXT: movrnz %i2, 1, %i5 -; SPARC64-NEXT: movrnz %l1, 1, %g2 -; SPARC64-NEXT: movcs %xcc, 1, %i1 -; SPARC64-NEXT: and %g2, %i5, %i2 -; SPARC64-NEXT: or %i2, %i3, %i2 -; SPARC64-NEXT: movrnz %i4, 1, %g3 -; SPARC64-NEXT: or %i2, %g3, %i2 -; SPARC64-NEXT: or %i2, %i1, %i1 -; SPARC64-NEXT: srl %i1, 0, %i2 +; SPARC64-NEXT: mov %g0, %g4 +; SPARC64-NEXT: mov %g0, %g5 +; SPARC64-NEXT: add %o0, %l1, %i3 +; SPARC64-NEXT: cmp %i3, %o0 +; SPARC64-NEXT: movrnz %i2, 1, %g2 +; SPARC64-NEXT: movrnz %i0, 1, %g3 +; SPARC64-NEXT: and %g3, %g2, %i0 +; SPARC64-NEXT: movcs %xcc, 1, %i4 +; SPARC64-NEXT: movrnz %l0, 1, %g4 +; SPARC64-NEXT: or %i0, %g4, %i0 +; SPARC64-NEXT: movrnz %i5, 1, %g5 +; SPARC64-NEXT: or %i0, %g5, %i0 +; SPARC64-NEXT: ba .LBB0_8 +; SPARC64-NEXT: or %i0, %i4, %i0 +; 
SPARC64-NEXT: .LBB0_3: ! %overflow.no.lhs +; SPARC64-NEXT: brz %i2, .LBB0_7 +; SPARC64-NEXT: nop +; SPARC64-NEXT: ! %bb.4: ! %overflow.no.lhs.only +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %o0, %i5 +; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i2, %o3 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: add %i5, %o1, %i3 +; SPARC64-NEXT: ba .LBB0_6 +; SPARC64-NEXT: cmp %i3, %i5 +; SPARC64-NEXT: .LBB0_5: ! %overflow.no.rhs.only +; SPARC64-NEXT: mov %i2, %o0 +; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i4, %o3 +; SPARC64-NEXT: mov %o0, %i4 +; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %i2, %o0 +; SPARC64-NEXT: mov %i3, %o1 +; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i0, %o3 +; SPARC64-NEXT: mov %g0, %i2 +; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: add %i4, %o1, %i3 +; SPARC64-NEXT: cmp %i3, %i4 +; SPARC64-NEXT: .LBB0_6: ! %overflow.res +; SPARC64-NEXT: movcs %xcc, 1, %i2 +; SPARC64-NEXT: srl %i2, 0, %i2 +; SPARC64-NEXT: add %o0, %i2, %i2 +; SPARC64-NEXT: ba .LBB0_8 +; SPARC64-NEXT: movrnz %i2, 1, %i0 +; SPARC64-NEXT: .LBB0_7: ! %overflow.no +; SPARC64-NEXT: mov %i0, %o0 +; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %i2, %o2 +; SPARC64-NEXT: call __multi3 +; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %o0, %i3 +; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: .LBB0_8: ! %overflow.res +; SPARC64-NEXT: and %i0, 1, %i2 ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore %g0, %o1, %o1 +; SPARC64-NEXT: restore %g0, %i3, %o0 ; ; SPARC64-VIS3-LABEL: muloti_test: ; SPARC64-VIS3: .register %g2, #scratch ; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! %start +; SPARC64-VIS3-NEXT: ! %bb.0: ! %overflow.entry ; SPARC64-VIS3-NEXT: save %sp, -128, %sp +; SPARC64-VIS3-NEXT: brz %i0, .LBB0_3 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.1: ! %overflow.lhs +; SPARC64-VIS3-NEXT: brz %i2, .LBB0_5 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.2: ! %overflow ; SPARC64-VIS3-NEXT: mov %g0, %i5 ; SPARC64-VIS3-NEXT: mov %g0, %g2 ; SPARC64-VIS3-NEXT: mov %g0, %g3 @@ -227,9 +490,59 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-VIS3-NEXT: umulxhi %i2, %i1, %i2 ; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g5 ; SPARC64-VIS3-NEXT: or %i0, %g5, %i0 -; SPARC64-VIS3-NEXT: or %i0, %i5, %i0 +; SPARC64-VIS3-NEXT: ba .LBB0_7 +; SPARC64-VIS3-NEXT: or %i0, %i5, %i5 +; SPARC64-VIS3-NEXT: .LBB0_3: ! %overflow.no.lhs +; SPARC64-VIS3-NEXT: brz %i2, .LBB0_6 +; SPARC64-VIS3-NEXT: nop +; SPARC64-VIS3-NEXT: ! %bb.4: ! 
%overflow.no.lhs.only +; SPARC64-VIS3-NEXT: mov %g0, %g2 +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i4 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 +; SPARC64-VIS3-NEXT: add %g3, %i4, %g3 +; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 +; SPARC64-VIS3-NEXT: add %i4, %i0, %i0 +; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i3 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 +; SPARC64-VIS3-NEXT: mov %i3, %i1 +; SPARC64-VIS3-NEXT: add %g3, %i2, %i4 +; SPARC64-VIS3-NEXT: cmp %i4, %g3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g2 +; SPARC64-VIS3-NEXT: srl %g2, 0, %i2 +; SPARC64-VIS3-NEXT: add %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: ba .LBB0_8 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 +; SPARC64-VIS3-NEXT: .LBB0_5: ! %overflow.no.rhs.only +; SPARC64-VIS3-NEXT: mov %g0, %g2 +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mulx %i2, %i1, %i4 +; SPARC64-VIS3-NEXT: umulxhi %i3, %i1, %g3 +; SPARC64-VIS3-NEXT: add %g3, %i4, %g3 +; SPARC64-VIS3-NEXT: mulx %i2, %i0, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i3, %i0, %i4 +; SPARC64-VIS3-NEXT: add %i4, %i2, %i2 +; SPARC64-VIS3-NEXT: mulx %i3, %i1, %i1 +; SPARC64-VIS3-NEXT: mulx %i3, %i0, %i0 +; SPARC64-VIS3-NEXT: add %g3, %i0, %i4 +; SPARC64-VIS3-NEXT: cmp %i4, %g3 +; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g2 +; SPARC64-VIS3-NEXT: srl %g2, 0, %i0 +; SPARC64-VIS3-NEXT: add %i2, %i0, %i0 +; SPARC64-VIS3-NEXT: ba .LBB0_8 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 +; SPARC64-VIS3-NEXT: .LBB0_6: ! %overflow.no +; SPARC64-VIS3-NEXT: mov %g0, %i5 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %i4 +; SPARC64-VIS3-NEXT: add %i4, %i2, %i2 +; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i0 +; SPARC64-VIS3-NEXT: add %i2, %i0, %i4 +; SPARC64-VIS3-NEXT: .LBB0_7: ! %overflow.res ; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: srl %i0, 0, %i2 +; SPARC64-VIS3-NEXT: .LBB0_8: ! 
%overflow.res +; SPARC64-VIS3-NEXT: and %i5, 1, %i2 ; SPARC64-VIS3-NEXT: ret ; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll index 9b5fa1c2bc811..c19ce3f34011e 100644 --- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll @@ -3,200 +3,568 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-LABEL: muloti_test: -; THUMBV6: @ %bb.0: @ %start +; THUMBV6: @ %bb.0: @ %overflow.entry ; THUMBV6-NEXT: .save {r4, r5, r6, r7, lr} ; THUMBV6-NEXT: push {r4, r5, r6, r7, lr} -; THUMBV6-NEXT: .pad #60 -; THUMBV6-NEXT: sub sp, #60 +; THUMBV6-NEXT: .pad #84 +; THUMBV6-NEXT: sub sp, #84 ; THUMBV6-NEXT: mov r6, r3 -; THUMBV6-NEXT: mov r1, r2 -; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill -; THUMBV6-NEXT: mov r4, r0 +; THUMBV6-NEXT: str r0, [sp, #48] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #108] +; THUMBV6-NEXT: ldr r5, [sp, #104] +; THUMBV6-NEXT: str r5, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: str r0, [sp, #52] @ 4-byte Spill +; THUMBV6-NEXT: orrs r5, r0 +; THUMBV6-NEXT: ldr r1, [sp, #124] +; THUMBV6-NEXT: ldr r4, [sp, #120] +; THUMBV6-NEXT: ldr r0, [sp, #116] +; THUMBV6-NEXT: str r0, [sp, #68] @ 4-byte Spill +; THUMBV6-NEXT: ldr r3, [sp, #112] +; THUMBV6-NEXT: str r4, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #60] @ 4-byte Spill +; THUMBV6-NEXT: str r2, [sp, #72] @ 4-byte Spill +; THUMBV6-NEXT: str r6, [sp, #76] @ 4-byte Spill +; THUMBV6-NEXT: str r3, [sp, #64] @ 4-byte Spill +; THUMBV6-NEXT: bne .LBB0_1 +; THUMBV6-NEXT: b .LBB0_3 +; THUMBV6-NEXT: .LBB0_1: @ %overflow.lhs +; THUMBV6-NEXT: orrs r4, r1 +; THUMBV6-NEXT: bne .LBB0_2 +; THUMBV6-NEXT: b .LBB0_5 +; THUMBV6-NEXT: .LBB0_2: @ %overflow +; THUMBV6-NEXT: str r4, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: movs r4, #0 +; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r7, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #32] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r6, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill +; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 ; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: ldr r2, [sp, #88] -; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill -; THUMBV6-NEXT: movs r5, #0 -; THUMBV6-NEXT: mov r0, r1 -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r1, r0 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: adcs r1, r4 ; THUMBV6-NEXT: str r1, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: str r0, [r4] -; THUMBV6-NEXT: ldr r2, [sp, #96] -; THUMBV6-NEXT: str r2, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: mov r4, r6 -; THUMBV6-NEXT: str r6, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; 
THUMBV6-NEXT: ldr r7, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #20] @ 4-byte Spill +; THUMBV6-NEXT: str r5, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: ldr r5, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: mov r0, r5 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r6, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r1, [sp, #16] @ 4-byte Spill +; THUMBV6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r7, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; THUMBV6-NEXT: adds r2, r1, r2 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: str r1, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adcs r2, r0 +; THUMBV6-NEXT: str r2, [sp, #8] @ 4-byte Spill +; THUMBV6-NEXT: ldr r6, [sp, #72] @ 4-byte Reload ; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: mov r7, r1 -; THUMBV6-NEXT: subs r0, r1, #1 -; THUMBV6-NEXT: sbcs r7, r0 -; THUMBV6-NEXT: ldr r0, [sp, #100] -; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r6, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r6, [sp, #68] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: subs r2, r1, #1 -; THUMBV6-NEXT: sbcs r1, r2 -; THUMBV6-NEXT: subs r2, r4, #1 +; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: adds r0, r7, r1 +; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV6-NEXT: mov r7, r4 +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r7 +; THUMBV6-NEXT: str r1, [sp] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r6 +; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r0 +; THUMBV6-NEXT: str r1, [sp, #64] @ 
4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: ldr r7, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r7 +; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: adds r0, r0, r6 +; THUMBV6-NEXT: ldr r2, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: ldr r2, [sp] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #72] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r0 +; THUMBV6-NEXT: adcs r4, r4 +; THUMBV6-NEXT: ldr r3, [sp, #32] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r3, #1 ; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: ldr r4, [sp, #32] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r4, #1 -; THUMBV6-NEXT: sbcs r4, r2 -; THUMBV6-NEXT: ands r4, r3 -; THUMBV6-NEXT: orrs r4, r1 -; THUMBV6-NEXT: orrs r4, r7 -; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: adds r7, r1, r0 +; THUMBV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r0, #1 +; THUMBV6-NEXT: sbcs r0, r2 +; THUMBV6-NEXT: subs r2, r7, #1 +; THUMBV6-NEXT: sbcs r7, r2 +; THUMBV6-NEXT: mov r6, r7 +; THUMBV6-NEXT: ldr r7, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r7, #1 +; THUMBV6-NEXT: sbcs r7, r2 +; THUMBV6-NEXT: ands r7, r6 +; THUMBV6-NEXT: orrs r7, r0 +; THUMBV6-NEXT: orrs r7, r3 +; THUMBV6-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: orrs r7, r0 +; THUMBV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r0, #1 +; THUMBV6-NEXT: sbcs r0, r2 +; THUMBV6-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r3, #1 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: mov r6, r3 +; THUMBV6-NEXT: ldr r3, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r3, #1 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: subs r2, r5, #1 +; THUMBV6-NEXT: sbcs r5, r2 +; THUMBV6-NEXT: ands r5, r3 +; THUMBV6-NEXT: orrs r5, r6 +; THUMBV6-NEXT: orrs r5, r0 +; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: orrs r5, r2 +; THUMBV6-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r3, #1 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: mov r6, r3 +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r2, #1 +; THUMBV6-NEXT: ldr r3, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: str r3, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: ands r2, r6 +; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: orrs r2, r5 +; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: ldr r5, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: orrs r5, r7 +; THUMBV6-NEXT: orrs r5, r4 +; THUMBV6-NEXT: b .LBB0_8 +; THUMBV6-NEXT: .LBB0_3: @ %overflow.no.lhs +; THUMBV6-NEXT: mov r0, r4 +; THUMBV6-NEXT: orrs r0, r1 +; THUMBV6-NEXT: bne .LBB0_4 +; THUMBV6-NEXT: b .LBB0_7 +; THUMBV6-NEXT: .LBB0_4: @ %overflow.no.lhs.only +; THUMBV6-NEXT: mov r5, r4 +; THUMBV6-NEXT: movs r4, #0 +; THUMBV6-NEXT: mov r0, r2 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r7, r2 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: str r5, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl 
__aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: adcs r6, r4 +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r5, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: adds r0, r6, r1 +; THUMBV6-NEXT: str r0, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: mov r7, r4 +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: adcs r6, r7 ; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #60] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: ldr r7, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: ldr r5, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: adds r0, r1, r7 -; THUMBV6-NEXT: str r0, [sp, #20] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r5 -; THUMBV6-NEXT: adcs r0, r5 -; THUMBV6-NEXT: orrs r0, r4 -; THUMBV6-NEXT: str r0, [sp, #16] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #92] +; THUMBV6-NEXT: str r1, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: adds r3, r0, r2 +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: adcs r2, r1 +; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r3 +; THUMBV6-NEXT: str r0, [sp, #60] @ 4-byte Spill +; THUMBV6-NEXT: adcs r2, r6 +; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r7, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: ldr r7, [sp, #80] +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: mov r0, r5 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r7, r1 +; THUMBV6-NEXT: adds r6, r0, r6 +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r5, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: adds r0, r0, r6 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: adds r0, r7, 
r1 +; THUMBV6-NEXT: str r0, [sp, #72] @ 4-byte Spill +; THUMBV6-NEXT: mov r7, r4 +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #76] @ 4-byte Spill +; THUMBV6-NEXT: adcs r6, r7 +; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r5, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: adcs r1, r6 +; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: b .LBB0_6 +; THUMBV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; THUMBV6-NEXT: movs r4, #0 +; THUMBV6-NEXT: mov r0, r3 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r7, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: mov r5, r3 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV6-NEXT: mov r4, r1 -; THUMBV6-NEXT: subs r0, r1, #1 -; THUMBV6-NEXT: sbcs r4, r0 -; THUMBV6-NEXT: ldr r6, [sp, #84] +; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r7, r0, r1 +; THUMBV6-NEXT: adcs r6, r4 +; THUMBV6-NEXT: mov r0, r5 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r5, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: adds r0, r0, r7 +; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: adds r0, r6, r1 +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: ldr r6, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r7, r4 +; THUMBV6-NEXT: adcs r7, r4 ; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r7 +; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #60] @ 4-byte Reload ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV6-NEXT: subs r2, r1, #1 -; THUMBV6-NEXT: sbcs r1, r2 -; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r3, #1 -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: 
str r6, [sp, #8] @ 4-byte Spill -; THUMBV6-NEXT: subs r2, r6, #1 -; THUMBV6-NEXT: sbcs r6, r2 -; THUMBV6-NEXT: ands r6, r3 -; THUMBV6-NEXT: orrs r6, r1 -; THUMBV6-NEXT: orrs r6, r4 -; THUMBV6-NEXT: ldr r0, [sp, #12] @ 4-byte Reload -; THUMBV6-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r1, r0 -; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: mov r5, r1 +; THUMBV6-NEXT: ldr r7, [sp, #64] @ 4-byte Reload ; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r4, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r6 ; THUMBV6-NEXT: mov r2, r4 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: mov r3, r4 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r1, r0 -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: adcs r1, r5 -; THUMBV6-NEXT: orrs r1, r6 -; THUMBV6-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #52] @ 4-byte Spill +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: adds r2, r0, r2 +; THUMBV6-NEXT: adcs r5, r1 +; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adcs r5, r0 +; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r7, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r7, r1 +; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: ldr r6, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: adds r0, r7, r1 +; THUMBV6-NEXT: str r0, [sp, #64] @ 4-byte Spill +; THUMBV6-NEXT: mov r7, r4 +; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #68] @ 4-byte Spill +; THUMBV6-NEXT: adcs r6, r7 +; THUMBV6-NEXT: add r2, sp, #72 +; THUMBV6-NEXT: ldm r2, {r0, r1, r2} @ 12-byte Folded Reload +; THUMBV6-NEXT: ldr r3, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: ldr r2, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: adcs r1, r6 +; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 ; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; 
THUMBV6-NEXT: orrs r3, r2 -; THUMBV6-NEXT: subs r2, r3, #1 -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; THUMBV6-NEXT: orrs r7, r2 -; THUMBV6-NEXT: subs r2, r7, #1 -; THUMBV6-NEXT: sbcs r7, r2 -; THUMBV6-NEXT: ands r7, r3 -; THUMBV6-NEXT: orrs r7, r1 -; THUMBV6-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; THUMBV6-NEXT: orrs r7, r1 -; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; THUMBV6-NEXT: adds r1, r2, r1 -; THUMBV6-NEXT: str r1, [sp, #32] @ 4-byte Spill -; THUMBV6-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; THUMBV6-NEXT: adcs r0, r1 -; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: .LBB0_6: @ %overflow.res +; THUMBV6-NEXT: adcs r2, r4 +; THUMBV6-NEXT: adcs r5, r4 +; THUMBV6-NEXT: orrs r5, r2 +; THUMBV6-NEXT: subs r2, r5, #1 +; THUMBV6-NEXT: sbcs r5, r2 +; THUMBV6-NEXT: b .LBB0_8 +; THUMBV6-NEXT: .LBB0_7: @ %overflow.no +; THUMBV6-NEXT: movs r5, #0 +; THUMBV6-NEXT: mov r0, r2 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: mov r7, r2 +; THUMBV6-NEXT: mov r2, r3 +; THUMBV6-NEXT: mov r4, r3 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r4, r1 -; THUMBV6-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; THUMBV6-NEXT: adds r6, r0, r1 -; THUMBV6-NEXT: adcs r4, r5 -; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: mov r0, r6 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: str r0, [r2, #4] -; THUMBV6-NEXT: adcs r1, r5 -; THUMBV6-NEXT: adds r0, r4, r1 -; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: mov r6, r5 +; THUMBV6-NEXT: mov r4, r6 +; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill ; THUMBV6-NEXT: adcs r6, r5 -; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: mov r0, r7 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r4, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: ldr r7, [sp, #68] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r7 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload ; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r6 -; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r5 +; THUMBV6-NEXT: adds r0, r6, r1 +; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: mov r6, r7 +; THUMBV6-NEXT: mov r7, r5 +; THUMBV6-NEXT: adcs r7, r5 +; THUMBV6-NEXT: mov r0, r4 +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: mov r2, r6 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r0 ; THUMBV6-NEXT: mov r4, r1 -; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, 
r5 -; THUMBV6-NEXT: mov r3, r5 +; THUMBV6-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: adcs r4, r7 +; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r6 +; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: mov r6, r0 +; THUMBV6-NEXT: mov r7, r1 +; THUMBV6-NEXT: ldr r0, [sp, #80] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #60] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #76] @ 4-byte Reload ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r7 +; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload ; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r4 +; THUMBV6-NEXT: .LBB0_8: @ %overflow.res +; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: str r3, [r2] +; THUMBV6-NEXT: ldr r3, [sp, #40] @ 4-byte Reload +; THUMBV6-NEXT: str r3, [r2, #4] ; THUMBV6-NEXT: str r0, [r2, #8] -; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r0 ; THUMBV6-NEXT: str r1, [r2, #12] -; THUMBV6-NEXT: adcs r5, r5 -; THUMBV6-NEXT: orrs r5, r7 ; THUMBV6-NEXT: movs r0, #1 ; THUMBV6-NEXT: ands r0, r5 ; THUMBV6-NEXT: strb r0, [r2, #16] -; THUMBV6-NEXT: add sp, #60 +; THUMBV6-NEXT: add sp, #84 ; THUMBV6-NEXT: pop {r4, r5, r6, r7, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index fe1d06cb39e16..07cd9788d91e1 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -3,125 +3,211 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-LABEL: muloti_test: -; THUMBV7: @ %bb.0: @ %start +; THUMBV7: @ %bb.0: @ %overflow.entry ; THUMBV7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; THUMBV7-NEXT: .pad #44 -; THUMBV7-NEXT: sub sp, #44 -; THUMBV7-NEXT: ldr.w r8, [sp, #88] -; THUMBV7-NEXT: mov r9, r0 -; THUMBV7-NEXT: ldr r7, [sp, #96] -; THUMBV7-NEXT: ldr.w lr, [sp, #100] -; THUMBV7-NEXT: umull r0, r5, r2, r8 -; THUMBV7-NEXT: ldr r4, [sp, #80] -; THUMBV7-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV7-NEXT: umull r1, r0, r3, r7 -; THUMBV7-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: umull r0, r11, lr, r2 -; THUMBV7-NEXT: str r1, [sp, #20] @ 4-byte Spill -; THUMBV7-NEXT: ldr r1, [sp, #92] -; THUMBV7-NEXT: str r0, [sp] @ 4-byte Spill -; THUMBV7-NEXT: umull r0, r10, r7, r2 -; THUMBV7-NEXT: mov r7, r1 -; THUMBV7-NEXT: umull r6, r12, r1, r4 -; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV7-NEXT: ldr r0, [sp, #84] -; THUMBV7-NEXT: str r6, [sp, #24] @ 4-byte Spill -; THUMBV7-NEXT: umull r6, r1, r0, r8 -; THUMBV7-NEXT: str r6, [sp, #16] @ 4-byte Spill -; THUMBV7-NEXT: umull r6, r2, r2, r7 -; THUMBV7-NEXT: mov r7, r4 -; THUMBV7-NEXT: strd r6, r2, [sp, #8] @ 8-byte Folded 
Spill -; THUMBV7-NEXT: umull r2, r6, r4, r8 -; THUMBV7-NEXT: str r2, [sp, #36] @ 4-byte Spill -; THUMBV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; THUMBV7-NEXT: str r6, [sp, #28] @ 4-byte Spill -; THUMBV7-NEXT: movs r6, #0 -; THUMBV7-NEXT: str.w r2, [r9] -; THUMBV7-NEXT: umlal r5, r6, r3, r8 -; THUMBV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload -; THUMBV7-NEXT: ldr r4, [sp] @ 4-byte Reload -; THUMBV7-NEXT: add r4, r2 -; THUMBV7-NEXT: adds.w r2, r10, r4 -; THUMBV7-NEXT: str r2, [sp, #20] @ 4-byte Spill -; THUMBV7-NEXT: mov.w r2, #0 -; THUMBV7-NEXT: adc r2, r2, #0 -; THUMBV7-NEXT: cmp.w r12, #0 -; THUMBV7-NEXT: str r2, [sp, #32] @ 4-byte Spill +; THUMBV7-NEXT: .pad #12 +; THUMBV7-NEXT: sub sp, #12 +; THUMBV7-NEXT: ldrd r11, r6, [sp, #48] +; THUMBV7-NEXT: ldrd r10, r5, [sp, #64] +; THUMBV7-NEXT: ldrd r9, r12, [sp, #56] +; THUMBV7-NEXT: orrs.w r1, r11, r6 +; THUMBV7-NEXT: beq .LBB0_3 +; THUMBV7-NEXT: @ %bb.1: @ %overflow.lhs +; THUMBV7-NEXT: orr.w r4, r10, r5 +; THUMBV7-NEXT: cmp r4, #0 +; THUMBV7-NEXT: beq.w .LBB0_5 +; THUMBV7-NEXT: @ %bb.2: @ %overflow ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r12, #1 +; THUMBV7-NEXT: movne r4, #1 ; THUMBV7-NEXT: cmp r1, #0 -; THUMBV7-NEXT: ldr r2, [sp, #96] ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 -; THUMBV7-NEXT: orrs.w r10, r7, r0 -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r10, #1 -; THUMBV7-NEXT: orrs.w r7, r2, lr -; THUMBV7-NEXT: ldr r2, [sp, #92] -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r7, #1 -; THUMBV7-NEXT: cmp r0, #0 +; THUMBV7-NEXT: and.w lr, r1, r4 +; THUMBV7-NEXT: umull r7, r4, r6, r9 +; THUMBV7-NEXT: cmp.w r12, #0 +; THUMBV7-NEXT: mov r1, r12 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r0, #1 -; THUMBV7-NEXT: cmp r2, #0 -; THUMBV7-NEXT: mov r4, r2 -; THUMBV7-NEXT: mov r8, r2 +; THUMBV7-NEXT: movne r1, #1 +; THUMBV7-NEXT: cmp r6, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r4, #1 -; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; THUMBV7-NEXT: ands r0, r4 -; THUMBV7-NEXT: movs r4, #0 -; THUMBV7-NEXT: adds r5, r5, r2 -; THUMBV7-NEXT: str.w r5, [r9, #4] -; THUMBV7-NEXT: orr.w r0, r0, r1 -; THUMBV7-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; THUMBV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; THUMBV7-NEXT: and.w r5, r10, r7 -; THUMBV7-NEXT: orr.w r0, r0, r12 -; THUMBV7-NEXT: mov.w r12, #0 -; THUMBV7-NEXT: add r1, r2 -; THUMBV7-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; THUMBV7-NEXT: adcs r2, r6 -; THUMBV7-NEXT: ldr r6, [sp, #28] @ 4-byte Reload -; THUMBV7-NEXT: adc r7, r4, #0 -; THUMBV7-NEXT: adds r1, r1, r6 -; THUMBV7-NEXT: umlal r2, r7, r3, r8 -; THUMBV7-NEXT: adc r4, r4, #0 -; THUMBV7-NEXT: orrs r0, r4 -; THUMBV7-NEXT: orrs r0, r5 -; THUMBV7-NEXT: ldrd r5, r4, [sp, #36] @ 8-byte Folded Reload -; THUMBV7-NEXT: adds r5, r5, r4 -; THUMBV7-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; THUMBV7-NEXT: adcs r1, r4 -; THUMBV7-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: movne r6, #1 +; THUMBV7-NEXT: ands r1, r6 ; THUMBV7-NEXT: cmp r4, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: orrs r1, r4 +; THUMBV7-NEXT: umull r4, r6, r12, r11 +; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne r6, #1 +; THUMBV7-NEXT: orrs r6, r1 +; THUMBV7-NEXT: adds r1, r7, r4 +; THUMBV7-NEXT: umull r11, r4, r11, r9 +; THUMBV7-NEXT: adds.w r8, r4, r1 +; THUMBV7-NEXT: mov.w r1, #0 +; THUMBV7-NEXT: adc r4, r1, #0 ; THUMBV7-NEXT: cmp r3, #0 +; THUMBV7-NEXT: orr.w r4, r4, r6 +; THUMBV7-NEXT: umull r7, r6, r5, r2 +; THUMBV7-NEXT: orr.w lr, lr, r4 +; THUMBV7-NEXT: mov r4, r3 +; THUMBV7-NEXT: it ne +; 
THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: cmp r5, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne r5, #1 +; THUMBV7-NEXT: ands r4, r5 +; THUMBV7-NEXT: cmp r6, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r3, #1 -; THUMBV7-NEXT: cmp.w lr, #0 +; THUMBV7-NEXT: movne r6, #1 +; THUMBV7-NEXT: orrs r4, r6 +; THUMBV7-NEXT: umull r5, r6, r3, r10 +; THUMBV7-NEXT: cmp r6, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w lr, #1 -; THUMBV7-NEXT: cmp.w r11, #0 +; THUMBV7-NEXT: movne r6, #1 +; THUMBV7-NEXT: orrs r4, r6 +; THUMBV7-NEXT: add r5, r7 +; THUMBV7-NEXT: umull r6, r7, r10, r2 +; THUMBV7-NEXT: adds r5, r5, r7 +; THUMBV7-NEXT: adc r7, r1, #0 +; THUMBV7-NEXT: adds.w r6, r6, r11 +; THUMBV7-NEXT: orr.w r4, r4, r7 +; THUMBV7-NEXT: mov.w r7, #0 +; THUMBV7-NEXT: orr.w lr, lr, r4 +; THUMBV7-NEXT: umull r11, r4, r2, r9 +; THUMBV7-NEXT: adc.w r10, r8, r5 +; THUMBV7-NEXT: umlal r4, r7, r3, r9 +; THUMBV7-NEXT: umull r2, r5, r2, r12 +; THUMBV7-NEXT: adds.w r8, r2, r4 +; THUMBV7-NEXT: adcs.w r2, r7, r5 +; THUMBV7-NEXT: adc r4, r1, #0 +; THUMBV7-NEXT: umlal r2, r4, r3, r12 +; THUMBV7-NEXT: adds r2, r2, r6 +; THUMBV7-NEXT: adcs.w r3, r4, r10 +; THUMBV7-NEXT: adc r1, r1, #0 +; THUMBV7-NEXT: orr.w r1, r1, lr +; THUMBV7-NEXT: b .LBB0_8 +; THUMBV7-NEXT: .LBB0_3: @ %overflow.no.lhs +; THUMBV7-NEXT: orrs.w r1, r10, r5 +; THUMBV7-NEXT: beq.w .LBB0_7 +; THUMBV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; THUMBV7-NEXT: umull r1, lr, r2, r10 +; THUMBV7-NEXT: movs r7, #0 +; THUMBV7-NEXT: umlal lr, r7, r3, r10 +; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill +; THUMBV7-NEXT: umull r4, r8, r2, r5 +; THUMBV7-NEXT: adds.w r1, r4, lr +; THUMBV7-NEXT: str r1, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: adcs.w r7, r7, r8 +; THUMBV7-NEXT: mov.w r1, #0 +; THUMBV7-NEXT: adc lr, r1, #0 +; THUMBV7-NEXT: umull r8, r1, r10, r11 +; THUMBV7-NEXT: mla r1, r10, r6, r1 +; THUMBV7-NEXT: umlal r7, lr, r3, r5 +; THUMBV7-NEXT: mla r1, r5, r11, r1 +; THUMBV7-NEXT: adds.w r5, r7, r8 +; THUMBV7-NEXT: umull r4, r7, r2, r9 +; THUMBV7-NEXT: adc.w r10, lr, r1 +; THUMBV7-NEXT: movs r1, #0 +; THUMBV7-NEXT: umlal r7, r1, r3, r9 +; THUMBV7-NEXT: umull r2, lr, r2, r12 +; THUMBV7-NEXT: adds.w r8, r2, r7 +; THUMBV7-NEXT: mov.w r2, #0 +; THUMBV7-NEXT: adcs.w r1, r1, lr +; THUMBV7-NEXT: adc r2, r2, #0 +; THUMBV7-NEXT: umlal r1, r2, r3, r12 +; THUMBV7-NEXT: umull r3, r7, r9, r11 +; THUMBV7-NEXT: mla r7, r9, r6, r7 +; THUMBV7-NEXT: adds r1, r1, r3 +; THUMBV7-NEXT: mla r7, r12, r11, r7 +; THUMBV7-NEXT: mov r11, r4 +; THUMBV7-NEXT: adc.w r3, r2, r7 +; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; THUMBV7-NEXT: adds r2, r2, r1 +; THUMBV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: adcs r3, r1 +; THUMBV7-NEXT: adcs r1, r5, #0 +; THUMBV7-NEXT: adc r7, r10, #0 +; THUMBV7-NEXT: b .LBB0_6 +; THUMBV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; THUMBV7-NEXT: umull r1, r4, r9, r11 +; THUMBV7-NEXT: movs r7, #0 +; THUMBV7-NEXT: mov.w r8, #0 +; THUMBV7-NEXT: umlal r4, r7, r12, r11 +; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill +; THUMBV7-NEXT: umull r1, lr, r9, r6 +; THUMBV7-NEXT: adds r1, r1, r4 +; THUMBV7-NEXT: str r1, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: adcs.w r7, r7, lr +; THUMBV7-NEXT: umull lr, r1, r11, r10 +; THUMBV7-NEXT: adc r4, r8, #0 +; THUMBV7-NEXT: mla r1, r11, r5, r1 +; THUMBV7-NEXT: umlal r7, r4, r12, r6 +; THUMBV7-NEXT: mla r1, r6, r10, r1 +; THUMBV7-NEXT: adds.w r7, r7, lr +; THUMBV7-NEXT: str r7, [sp] @ 4-byte Spill +; THUMBV7-NEXT: mov.w r7, #0 +; THUMBV7-NEXT: adc.w r11, r4, r1 +; THUMBV7-NEXT: umull lr, r4, r9, r2 +; THUMBV7-NEXT: 
umlal r4, r7, r12, r2 +; THUMBV7-NEXT: umull r1, r9, r9, r3 +; THUMBV7-NEXT: adds.w r8, r1, r4 +; THUMBV7-NEXT: mov.w r4, #0 +; THUMBV7-NEXT: adcs.w r1, r7, r9 +; THUMBV7-NEXT: umull r7, r6, r2, r10 +; THUMBV7-NEXT: adc r4, r4, #0 +; THUMBV7-NEXT: mla r2, r2, r5, r6 +; THUMBV7-NEXT: umlal r1, r4, r12, r3 +; THUMBV7-NEXT: mla r2, r3, r10, r2 +; THUMBV7-NEXT: adds r1, r1, r7 +; THUMBV7-NEXT: adc.w r3, r4, r2 +; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; THUMBV7-NEXT: adds r2, r2, r1 +; THUMBV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: adcs r3, r1 +; THUMBV7-NEXT: ldr r1, [sp] @ 4-byte Reload +; THUMBV7-NEXT: adcs r1, r1, #0 +; THUMBV7-NEXT: adc r7, r11, #0 +; THUMBV7-NEXT: mov r11, lr +; THUMBV7-NEXT: .LBB0_6: @ %overflow.res +; THUMBV7-NEXT: orrs r1, r7 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r11, #1 -; THUMBV7-NEXT: adds r2, r2, r5 -; THUMBV7-NEXT: and.w r3, r3, lr -; THUMBV7-NEXT: str.w r2, [r9, #8] -; THUMBV7-NEXT: adcs r1, r7 -; THUMBV7-NEXT: str.w r1, [r9, #12] -; THUMBV7-NEXT: orr.w r1, r3, r11 -; THUMBV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; THUMBV7-NEXT: orr.w r1, r1, r4 -; THUMBV7-NEXT: orr.w r1, r1, r2 -; THUMBV7-NEXT: orr.w r0, r0, r1 -; THUMBV7-NEXT: adc r1, r12, #0 -; THUMBV7-NEXT: orrs r0, r1 -; THUMBV7-NEXT: and r0, r0, #1 -; THUMBV7-NEXT: strb.w r0, [r9, #16] -; THUMBV7-NEXT: add sp, #44 +; THUMBV7-NEXT: movne r1, #1 +; THUMBV7-NEXT: b .LBB0_8 +; THUMBV7-NEXT: .LBB0_7: @ %overflow.no +; THUMBV7-NEXT: umull r1, lr, r2, r9 +; THUMBV7-NEXT: movs r4, #0 +; THUMBV7-NEXT: umlal lr, r4, r3, r9 +; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill +; THUMBV7-NEXT: movs r1, #0 +; THUMBV7-NEXT: umull r7, r8, r2, r12 +; THUMBV7-NEXT: adds.w r7, r7, lr +; THUMBV7-NEXT: str r7, [sp] @ 4-byte Spill +; THUMBV7-NEXT: adcs.w r7, r4, r8 +; THUMBV7-NEXT: ldr r4, [sp, #60] +; THUMBV7-NEXT: adc r8, r1, #0 +; THUMBV7-NEXT: umlal r7, r8, r3, r12 +; THUMBV7-NEXT: umull r12, lr, r9, r11 +; THUMBV7-NEXT: mla r6, r9, r6, lr +; THUMBV7-NEXT: str.w r12, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: mla r12, r4, r11, r6 +; THUMBV7-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload +; THUMBV7-NEXT: umull lr, r6, r10, r2 +; THUMBV7-NEXT: mla r3, r10, r3, r6 +; THUMBV7-NEXT: mla r2, r5, r2, r3 +; THUMBV7-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: adds.w r3, r3, lr +; THUMBV7-NEXT: adc.w r6, r2, r12 +; THUMBV7-NEXT: adds r2, r7, r3 +; THUMBV7-NEXT: adc.w r3, r8, r6 +; THUMBV7-NEXT: ldr.w r8, [sp] @ 4-byte Reload +; THUMBV7-NEXT: .LBB0_8: @ %overflow.res +; THUMBV7-NEXT: strd r11, r8, [r0] +; THUMBV7-NEXT: and r1, r1, #1 +; THUMBV7-NEXT: strd r2, r3, [r0, #8] +; THUMBV7-NEXT: strb r1, [r0, #16] +; THUMBV7-NEXT: add sp, #12 ; THUMBV7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll index 55e917159fce9..997868766d1dd 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll @@ -3,15 +3,19 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; THUMBV7-LABEL: mulodi_test: -; THUMBV7: @ %bb.0: @ %start +; THUMBV7: @ %bb.0: @ %overflow.entry ; THUMBV7-NEXT: .save {r4, r5, r7, lr} ; THUMBV7-NEXT: push {r4, r5, r7, lr} -; THUMBV7-NEXT: umull r12, lr, r3, r0 +; THUMBV7-NEXT: cbz r1, .LBB0_3 +; THUMBV7-NEXT: @ %bb.1: @ %overflow.lhs +; THUMBV7-NEXT: cbz r3, .LBB0_5 +; THUMBV7-NEXT: 
@ %bb.2: @ %overflow +; THUMBV7-NEXT: umull lr, r4, r3, r0 ; THUMBV7-NEXT: cmp r3, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r3, #1 ; THUMBV7-NEXT: cmp r1, #0 -; THUMBV7-NEXT: umull r0, r4, r0, r2 +; THUMBV7-NEXT: umull r0, r12, r0, r2 ; THUMBV7-NEXT: umull r2, r5, r1, r2 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 @@ -20,15 +24,44 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r5, #1 ; THUMBV7-NEXT: orrs r1, r5 -; THUMBV7-NEXT: cmp.w lr, #0 +; THUMBV7-NEXT: cmp r4, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w lr, #1 -; THUMBV7-NEXT: orr.w r3, r1, lr -; THUMBV7-NEXT: add.w r1, r2, r12 +; THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: orr.w r3, r1, r4 +; THUMBV7-NEXT: add.w r1, r2, lr ; THUMBV7-NEXT: movs r2, #0 -; THUMBV7-NEXT: adds r1, r1, r4 +; THUMBV7-NEXT: adds.w r1, r1, r12 ; THUMBV7-NEXT: adc r2, r2, #0 -; THUMBV7-NEXT: orrs r2, r3 +; THUMBV7-NEXT: orr.w r12, r3, r2 +; THUMBV7-NEXT: and r2, r12, #1 +; THUMBV7-NEXT: pop {r4, r5, r7, pc} +; THUMBV7-NEXT: .LBB0_3: @ %overflow.no.lhs +; THUMBV7-NEXT: mov r5, r0 +; THUMBV7-NEXT: umull r0, r4, r0, r2 +; THUMBV7-NEXT: cbz r3, .LBB0_7 +; THUMBV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only +; THUMBV7-NEXT: mul r12, r1, r3 +; THUMBV7-NEXT: mla r1, r1, r2, r4 +; THUMBV7-NEXT: umlal r1, r12, r5, r3 +; THUMBV7-NEXT: b .LBB0_6 +; THUMBV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only +; THUMBV7-NEXT: mov lr, r0 +; THUMBV7-NEXT: umull r0, r4, r2, r0 +; THUMBV7-NEXT: mov r5, r1 +; THUMBV7-NEXT: mul r12, r3, r1 +; THUMBV7-NEXT: mla r1, r3, lr, r4 +; THUMBV7-NEXT: umlal r1, r12, r2, r5 +; THUMBV7-NEXT: .LBB0_6: @ %overflow.res +; THUMBV7-NEXT: cmp.w r12, #0 +; THUMBV7-NEXT: it ne +; THUMBV7-NEXT: movne.w r12, #1 +; THUMBV7-NEXT: and r2, r12, #1 +; THUMBV7-NEXT: pop {r4, r5, r7, pc} +; THUMBV7-NEXT: .LBB0_7: @ %overflow.no +; THUMBV7-NEXT: mla r3, r5, r3, r4 +; THUMBV7-NEXT: mov.w r12, #0 +; THUMBV7-NEXT: mla r1, r1, r2, r3 +; THUMBV7-NEXT: and r2, r12, #1 ; THUMBV7-NEXT: pop {r4, r5, r7, pc} start: %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index e101c702e6409..2d236cce94c30 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -6,60 +6,181 @@ ; This used to call muloti4, but that won't link with libgcc. 
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { ; CHECK-LABEL: x: -; CHECK: ## %bb.0: ## %entry +; CHECK: ## %bb.0: ## %overflow.entry ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: movq %rsi, %r8 +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: sarq $63, %r8 +; CHECK-NEXT: cmpq %r8, %rsi +; CHECK-NEXT: je LBB0_5 +; CHECK-NEXT: ## %bb.1: ## %overflow.lhs +; CHECK-NEXT: cmpq %rax, %rcx +; CHECK-NEXT: je LBB0_2 +; CHECK-NEXT: ## %bb.7: ## %overflow1 ; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: sarq $63, %rbx ; CHECK-NEXT: imulq %rdx, %rbx ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: mulq %rdx ; CHECK-NEXT: movq %rdx, %r10 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: mulq %r9 +; CHECK-NEXT: mulq %rdx ; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: mulq %r10 +; CHECK-NEXT: movq %rdx, %r10 ; CHECK-NEXT: movq %rax, %r11 -; CHECK-NEXT: addq %r10, %r11 -; CHECK-NEXT: adcq %rbx, %r9 -; CHECK-NEXT: movq %r9, %rbx +; CHECK-NEXT: addq %r9, %r11 +; CHECK-NEXT: adcq %rbx, %r10 +; CHECK-NEXT: movq %r10, %rbx ; CHECK-NEXT: sarq $63, %rbx -; CHECK-NEXT: movq %rcx, %r14 -; CHECK-NEXT: sarq $63, %r14 -; CHECK-NEXT: imulq %rdi, %r14 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rdi, %r14 +; CHECK-NEXT: imulq %rax, %r14 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: movq %rdx, %r10 +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rax, %rdi ; CHECK-NEXT: addq %r11, %rdi -; CHECK-NEXT: adcq %r14, %r10 -; CHECK-NEXT: movq %r10, %r11 +; CHECK-NEXT: adcq %r14, %r9 +; CHECK-NEXT: movq %r9, %r11 ; CHECK-NEXT: sarq $63, %r11 -; CHECK-NEXT: addq %r9, %r10 +; CHECK-NEXT: addq %r10, %r9 ; CHECK-NEXT: adcq %rbx, %r11 -; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: imulq %rcx -; CHECK-NEXT: addq %r10, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: addq %r9, %rcx ; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: movq %rdi, %rcx -; CHECK-NEXT: sarq $63, %rcx -; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: xorq %rax, %rcx -; CHECK-NEXT: orq %rdx, %rcx -; CHECK-NEXT: jne LBB0_1 -; CHECK-NEXT: ## %bb.2: ## %nooverflow +; CHECK-NEXT: movq %rdi, %rsi +; CHECK-NEXT: sarq $63, %rdi +; CHECK-NEXT: xorq %rdi, %rdx +; CHECK-NEXT: xorq %rcx, %rdi +; CHECK-NEXT: orq %rdx, %rdi +; CHECK-NEXT: jmp LBB0_8 +; CHECK-NEXT: LBB0_5: ## %overflow.no.lhs +; CHECK-NEXT: cmpq %rax, %rcx +; CHECK-NEXT: je LBB0_6 +; CHECK-NEXT: ## %bb.4: ## %overflow.no.lhs.only +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rsi, %r9 +; CHECK-NEXT: xorq %rax, %r9 +; CHECK-NEXT: movq %rdi, %r8 +; CHECK-NEXT: xorq %rax, %r8 +; CHECK-NEXT: subq %rax, %r8 +; CHECK-NEXT: sbbq %rax, %r9 +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: sets %r10b +; CHECK-NEXT: cmovnsq %rsi, %r9 +; CHECK-NEXT: cmovnsq %rdi, %r8 +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: xorq %rax, %rsi +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: xorq %rax, %rdi +; CHECK-NEXT: subq %rax, %rdi +; CHECK-NEXT: sbbq %rax, %rsi +; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: sets %r11b +; CHECK-NEXT: cmovnsq %rcx, %rsi +; 
CHECK-NEXT: cmovnsq %rdx, %rdi +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: mulq %rdi +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: imulq %r9, %rdi +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: imulq %rsi, %r9 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: mulq %rsi +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: addq %rdi, %rsi +; CHECK-NEXT: adcq %r9, %rdx +; CHECK-NEXT: xorb %r10b, %r11b +; CHECK-NEXT: movzbl %r11b, %ecx +; CHECK-NEXT: jmp LBB0_3 +; CHECK-NEXT: LBB0_2: ## %overflow.no.rhs.only +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rcx, %r9 +; CHECK-NEXT: xorq %rax, %r9 +; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: xorq %rax, %r8 +; CHECK-NEXT: subq %rax, %r8 +; CHECK-NEXT: sbbq %rax, %r9 +; CHECK-NEXT: testq %rcx, %rcx +; CHECK-NEXT: sets %r10b +; CHECK-NEXT: cmovnsq %rcx, %r9 +; CHECK-NEXT: cmovnsq %rdx, %r8 ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: movq %rdi, %rdx +; CHECK-NEXT: sarq $63, %rax +; CHECK-NEXT: movq %rsi, %r14 +; CHECK-NEXT: xorq %rax, %r14 +; CHECK-NEXT: movq %rdi, %r11 +; CHECK-NEXT: xorq %rax, %r11 +; CHECK-NEXT: subq %rax, %r11 +; CHECK-NEXT: sbbq %rax, %r14 +; CHECK-NEXT: testq %rsi, %rsi +; CHECK-NEXT: sets %bl +; CHECK-NEXT: cmovnsq %rsi, %r14 +; CHECK-NEXT: cmovnsq %rdi, %r11 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: imulq %r9, %r11 +; CHECK-NEXT: addq %rdx, %r11 +; CHECK-NEXT: imulq %r14, %r9 +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: mulq %r14 +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: addq %r11, %rsi +; CHECK-NEXT: adcq %r9, %rdx +; CHECK-NEXT: xorb %r10b, %bl +; CHECK-NEXT: movzbl %bl, %ecx +; CHECK-NEXT: LBB0_3: ## %overflow.res +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: negq %rdi +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: addq %rcx, %rax +; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: cmpq %rcx, %rax +; CHECK-NEXT: setb %r8b +; CHECK-NEXT: xorq %rdi, %rsi +; CHECK-NEXT: addq %r8, %rsi +; CHECK-NEXT: xorq %rdx, %rdi +; CHECK-NEXT: cmpq %r8, %rsi +; CHECK-NEXT: adcq $0, %rdi +; CHECK-NEXT: LBB0_8: ## %overflow.res +; CHECK-NEXT: setne %cl +; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: jne LBB0_10 +; CHECK-NEXT: LBB0_11: ## %nooverflow +; CHECK-NEXT: movq %rsi, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq -; CHECK-NEXT: LBB0_1: ## %overflow +; CHECK-NEXT: LBB0_6: ## %overflow.no +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: mulq %rdx +; CHECK-NEXT: imulq %rcx, %rdi +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: imulq %r8, %rsi +; CHECK-NEXT: addq %rdi, %rsi +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: testb $1, %cl +; CHECK-NEXT: je LBB0_11 +; CHECK-NEXT: LBB0_10: ## %overflow ; CHECK-NEXT: ud2 entry: %tmp16 = zext i64 %a.coerce0 to i128 diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 13596e1b18768..1460a2564cc3e 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -4,64 +4,185 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-LABEL: smuloi128: -; X64: ## %bb.0: -; X64-NEXT: pushq %r15 +; X64: ## %bb.0: ## %overflow.entry +; X64-NEXT: pushq %rbp ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r15 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 32 -; 
X64-NEXT: .cfi_offset %rbx, -32 -; X64-NEXT: .cfi_offset %r14, -24 -; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rsi, %r9 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 40 +; X64-NEXT: .cfi_offset %rbx, -40 +; X64-NEXT: .cfi_offset %r14, -32 +; X64-NEXT: .cfi_offset %r15, -24 +; X64-NEXT: .cfi_offset %rbp, -16 +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rdi, %r9 +; X64-NEXT: sarq $63, %r9 +; X64-NEXT: cmpq %r9, %rsi +; X64-NEXT: je LBB0_5 +; X64-NEXT: ## %bb.1: ## %overflow.lhs +; X64-NEXT: cmpq %rax, %rcx +; X64-NEXT: je LBB0_2 +; X64-NEXT: ## %bb.7: ## %overflow ; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: sarq $63, %r14 ; X64-NEXT: imulq %rdx, %r14 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rdx ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: mulq %rdx ; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r11, %rbx -; X64-NEXT: adcq %r14, %r10 -; X64-NEXT: movq %r10, %r14 +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, %r14 ; X64-NEXT: sarq $63, %r14 -; X64-NEXT: movq %rcx, %r15 -; X64-NEXT: sarq $63, %r15 -; X64-NEXT: imulq %rdi, %r15 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: imulq %rax, %r15 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rbx, %rdi -; X64-NEXT: adcq %r15, %r11 -; X64-NEXT: movq %r11, %rbx +; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: movq %r10, %rbx ; X64-NEXT: sarq $63, %rbx -; X64-NEXT: addq %r10, %r11 +; X64-NEXT: addq %r11, %r10 ; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: imulq %rcx -; X64-NEXT: addq %r11, %rax +; X64-NEXT: addq %r10, %rax ; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: movq %rdi, 8(%r8) +; X64-NEXT: movq %rdi, %rsi ; X64-NEXT: sarq $63, %rdi ; X64-NEXT: xorq %rdi, %rdx ; X64-NEXT: xorq %rax, %rdi ; X64-NEXT: orq %rdx, %rdi +; X64-NEXT: jmp LBB0_8 +; X64-NEXT: LBB0_5: ## %overflow.no.lhs +; X64-NEXT: cmpq %rax, %rcx +; X64-NEXT: je LBB0_6 +; X64-NEXT: ## %bb.4: ## %overflow.no.lhs.only +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: xorq %rax, %r10 +; X64-NEXT: subq %rax, %r10 +; X64-NEXT: sbbq %rax, %r11 +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: sets %bl +; X64-NEXT: cmovnsq %rsi, %r11 +; X64-NEXT: cmovnsq %rdi, %r10 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: xorq %rax, %rsi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: xorq %rax, %rdi +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: sbbq %rax, %rsi +; X64-NEXT: testq %rcx, %rcx +; X64-NEXT: sets %bpl +; X64-NEXT: cmovnsq %rcx, %rsi +; X64-NEXT: cmovnsq %rdx, %rdi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: imulq %r11, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: imulq %rsi, %r11 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: jmp LBB0_3 +; X64-NEXT: LBB0_2: ## %overflow.no.rhs.only +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: 
movq %rdx, %r10 +; X64-NEXT: xorq %rax, %r10 +; X64-NEXT: subq %rax, %r10 +; X64-NEXT: sbbq %rax, %r11 +; X64-NEXT: testq %rcx, %rcx +; X64-NEXT: sets %bl +; X64-NEXT: cmovnsq %rcx, %r11 +; X64-NEXT: cmovnsq %rdx, %r10 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rsi, %r14 +; X64-NEXT: xorq %rax, %r14 +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: xorq %rax, %rcx +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: sbbq %rax, %r14 +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: sets %bpl +; X64-NEXT: cmovnsq %rsi, %r14 +; X64-NEXT: cmovnsq %rdi, %rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: imulq %r11, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: imulq %r14, %r11 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rcx, %rsi +; X64-NEXT: LBB0_3: ## %overflow.res +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: xorb %bl, %bpl +; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: xorq %rcx, %r9 +; X64-NEXT: addq %rax, %r9 +; X64-NEXT: xorl %edi, %edi +; X64-NEXT: cmpq %rax, %r9 +; X64-NEXT: setb %dil +; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: xorq %rdx, %rcx +; X64-NEXT: cmpq %rdi, %rsi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: LBB0_8: ## %overflow.res ; X64-NEXT: setne %al -; X64-NEXT: movq %rsi, (%r8) +; X64-NEXT: jmp LBB0_9 +; X64-NEXT: LBB0_6: ## %overflow.no +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: mulq %rdx +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: imulq %rcx, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: imulq %r10, %rsi +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: LBB0_9: ## %overflow.res +; X64-NEXT: movq %r9, (%r8) +; X64-NEXT: movq %rsi, 8(%r8) +; X64-NEXT: andb $1, %al +; X64-NEXT: ## kill: def $al killed $al killed $eax ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp ; X64-NEXT: retq ; ; X86-LABEL: smuloi128: -; X86: ## %bb.0: +; X86: ## %bb.0: ## %overflow.entry ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -70,196 +191,212 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $44, %esp -; X86-NEXT: .cfi_def_cfa_offset 64 +; X86-NEXT: subl $52, %esp +; X86-NEXT: .cfi_def_cfa_offset 72 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: je LBB0_12 +; X86-NEXT: ## %bb.1: ## %overflow.lhs ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: je LBB0_2 +; X86-NEXT: ## %bb.14: ## %overflow ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl 
%edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %edi -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %esi, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ebx, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %esi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded 
Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: imull %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %esi, %ebx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %ebx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: addl %eax, %ebx ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %esi, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: xorl %ecx, %eax @@ -268,38 +405,435 @@ define zeroext i1 
@smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: xorl %ebx, %ecx ; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: jmp LBB0_15 +; X86-NEXT: LBB0_12: ## %overflow.no.lhs +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: je LBB0_13 +; X86-NEXT: ## %bb.7: ## %overflow.no.lhs.only +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %esi, %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB0_9 +; X86-NEXT: ## %bb.8: ## %overflow.no.lhs.only +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: LBB0_9: ## %overflow.no.lhs.only +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB0_11 +; X86-NEXT: ## %bb.10: ## %overflow.no.lhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: LBB0_11: ## %overflow.no.lhs.only +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 
1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull %edi, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: setb %dl +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: addl %edx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jmp LBB0_15 +; X86-NEXT: LBB0_2: ## 
%overflow.no.rhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB0_4 +; X86-NEXT: ## %bb.3: ## %overflow.no.rhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: LBB0_4: ## %overflow.no.rhs.only +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB0_6 +; X86-NEXT: ## %bb.5: ## %overflow.no.rhs.only +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: LBB0_6: ## %overflow.no.rhs.only +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl 
%ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: mull %ecx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: cmpl %edx, %ebx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: setb %dl +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: cmpl %edx, %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: LBB0_15: ## %overflow.res ; X86-NEXT: setne %al -; X86-NEXT: addl $44, %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; 
X86-NEXT: popl %ebp -; X86-NEXT: retl - %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) - %val = extractvalue {i128, i1} %t, 0 - %obit = extractvalue {i128, i1} %t, 1 - store i128 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { -; X64-LABEL: smuloi256: -; X64: ## %bb.0: -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: pushq %r15 -; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r14 -; X64-NEXT: .cfi_def_cfa_offset 32 +; X86-NEXT: jmp LBB0_16 +; X86-NEXT: LBB0_13: ## %overflow.no +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %esi, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: LBB0_16: ## %overflow.res +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, 4(%ecx) +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: andb $1, %al +; X86-NEXT: ## kill: def $al killed $al killed $eax +; X86-NEXT: addl $52, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl + %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) + %val = extractvalue {i128, i1} %t, 0 + %obit = extractvalue {i128, i1} %t, 1 + store i128 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { +; X64-LABEL: smuloi256: +; X64: ## %bb.0: ## %overflow.entry +; X64-NEXT: pushq %rbp +; X64-NEXT: .cfi_def_cfa_offset 16 +; X64-NEXT: pushq %r15 +; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: pushq %r13 ; X64-NEXT: .cfi_def_cfa_offset 
40 ; X64-NEXT: pushq %r12 @@ -312,199 +846,558 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %r8, %r12 -; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %rcx, %r12 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq %rdi, %r11 -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: movq %r11, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movq %r12, %rdx +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: xorq %r8, %rcx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: je LBB1_4 +; X64-NEXT: ## %bb.1: ## %overflow.lhs +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: xorq %rbx, %rsi +; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: je LBB1_2 +; X64-NEXT: ## %bb.6: ## %overflow +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: addq %rdi, %r13 +; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %rcx, %r8 -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: imulq %rcx, %rsi +; X64-NEXT: movzbl %al, %edi ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: addq %rax, %r15 -; X64-NEXT: addq %rsi, %r15 -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %r8, %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: movq %r12, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: imulq %rdx, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdx +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: addq %rax, %rbp +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rdi -; 
X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %r8, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rdi, %rbx -; X64-NEXT: setb %dil -; X64-NEXT: movq %r10, %rax +; X64-NEXT: adcq %rsi, %rdi +; X64-NEXT: setb %sil +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rbx, %rsi +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rdi, %rcx +; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; X64-NEXT: adcq %r13, %rbx +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq %r13, %rbp -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r15, %r12 -; X64-NEXT: sarq $63, %r12 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r9, %r10 +; X64-NEXT: adcq %r8, %rsi +; X64-NEXT: setb %r9b ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r11 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %r9, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: movq %r15, %rsi +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: imulq %rsi, %r11 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %r11, %r9 +; X64-NEXT: addq %rax, %r9 +; X64-NEXT: addq %rax, %r8 +; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %rbp, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %r9 +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r15, %r11 +; X64-NEXT: movq %r15, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: imulq %rcx, %r11 +; X64-NEXT: 
movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: addq %r11, %rcx +; X64-NEXT: movq %r12, %r15 +; X64-NEXT: imulq %rsi, %r12 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r13, %rdi -; X64-NEXT: setb %r8b +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r12, %r11 +; X64-NEXT: addq %rax, %r11 +; X64-NEXT: addq %r14, %rbx +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r13 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r13, %rsi +; X64-NEXT: adcq %r14, %r12 +; X64-NEXT: setb %r14b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload +; X64-NEXT: mulq %rbp +; X64-NEXT: addq %r12, %rax +; X64-NEXT: movzbl %r14b, %r14d +; X64-NEXT: adcq %r14, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r11, %rdx +; X64-NEXT: addq %r8, %rcx +; X64-NEXT: adcq %r9, %rsi +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Folded Reload +; X64-NEXT: movq %r10, %r8 +; X64-NEXT: sarq $63, %r8 +; X64-NEXT: xorq %r8, %rax +; X64-NEXT: xorq %r8, %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: xorq %r8, %rdx +; X64-NEXT: xorq %rsi, %r8 +; X64-NEXT: orq %rdx, %r8 +; X64-NEXT: orq %rcx, %r8 +; X64-NEXT: jmp LBB1_7 +; X64-NEXT: LBB1_4: ## %overflow.no.lhs +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: xorq %rbx, %rsi +; X64-NEXT: orq %rcx, %rsi +; X64-NEXT: je LBB1_5 +; X64-NEXT: ## %bb.3: ## %overflow.no.lhs.only +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: movq %r12, %rcx +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: movq %r8, %rbp +; X64-NEXT: xorq %rsi, %rbp +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r11, %r13 +; X64-NEXT: xorq %rsi, %r13 +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: xorq %rsi, %r10 +; X64-NEXT: subq %rsi, %r10 +; X64-NEXT: sbbq %rsi, %r13 +; X64-NEXT: sbbq %rsi, %rbp +; X64-NEXT: sbbq %rsi, %rdx +; X64-NEXT: testq %r12, %r12 +; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill +; X64-NEXT: cmovnsq %r12, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: cmovnsq %r8, %rbp +; X64-NEXT: cmovnsq %r11, %r13 +; X64-NEXT: cmovnsq %rdi, %r10 +; X64-NEXT: movq %rbx, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: xorq %rdx, %r12 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: xorq %rdx, %r14 +; X64-NEXT: xorq %rdx, %r9 +; X64-NEXT: movq %r15, %r11 +; X64-NEXT: xorq %rdx, %r11 +; X64-NEXT: subq %rdx, %r11 +; X64-NEXT: sbbq %rdx, %r9 +; X64-NEXT: sbbq %rdx, %r14 +; X64-NEXT: sbbq %rdx, %r12 +; X64-NEXT: testq %rbx, %rbx +; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill +; X64-NEXT: cmovnsq %rbx, %r12 +; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %rcx, %r9 +; X64-NEXT: cmovnsq %r15, %r11 ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rcx 
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %r8d +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %r8, %rsi +; X64-NEXT: imulq %rbp, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload +; X64-NEXT: imulq %r15, %r11 +; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r9, %r11 +; X64-NEXT: addq %rdi, %rcx +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdi, %r13 -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq %r11, %rdi -; X64-NEXT: movq %r11, %r8 -; X64-NEXT: sarq $63, %rdi -; X64-NEXT: imulq %rdi, %r10 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: addq %rax, %r13 -; X64-NEXT: adcq %r9, %r11 -; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill -; X64-NEXT: adcq %rbp, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r11, %rbp -; X64-NEXT: sarq $63, %rbp -; X64-NEXT: addq %r14, %r13 -; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: addq %r8, %r9 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r9, %r10 +; X64-NEXT: adcq %rsi, %r8 +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r8, %r9 +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: imulq %r15, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: imulq %rbp, %r12 +; X64-NEXT: addq %r14, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: addq %r9, %rax +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %r11, %r10 +; X64-NEXT: adcq $0, %rax +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload +; X64-NEXT: movzbl %cl, %edx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: xorq %rcx, %r15 +; X64-NEXT: xorq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: cmpq %rdx, %r14 +; X64-NEXT: movq %r15, %rdx +; X64-NEXT: sbbq $0, %rdx +; X64-NEXT: setb %dl +; X64-NEXT: movzbl %dl, %edx +; X64-NEXT: xorq %rcx, %r10 +; X64-NEXT: xorq %rcx, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: 
xorq %rcx, %r12 +; X64-NEXT: xorq %rax, %rcx +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: sbbq $0, %rax +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %r12 +; X64-NEXT: orq %rcx, %r12 +; X64-NEXT: setne %al +; X64-NEXT: jmp LBB1_8 +; X64-NEXT: LBB1_2: ## %overflow.no.rhs.only +; X64-NEXT: movq %rbx, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: xorq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: xorq %rdx, %r14 +; X64-NEXT: movq %r9, %r13 +; X64-NEXT: xorq %rdx, %r13 +; X64-NEXT: movq %r15, %r10 +; X64-NEXT: xorq %rdx, %r10 +; X64-NEXT: subq %rdx, %r10 +; X64-NEXT: sbbq %rdx, %r13 +; X64-NEXT: sbbq %rdx, %r14 +; X64-NEXT: sbbq %rdx, %rcx +; X64-NEXT: testq %rbx, %rbx +; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill +; X64-NEXT: cmovnsq %rbx, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: cmovnsq %rax, %r14 +; X64-NEXT: cmovnsq %r9, %r13 +; X64-NEXT: cmovnsq %r15, %r10 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: adcq %rbp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: movq %r12, %rbp +; X64-NEXT: xorq %rax, %rbp +; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r8, %r12 +; X64-NEXT: xorq %rax, %r12 +; X64-NEXT: movq %r11, %rbx +; X64-NEXT: xorq %rax, %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: xorq %rax, %rdi +; X64-NEXT: subq %rax, %rdi +; X64-NEXT: sbbq %rax, %rbx +; X64-NEXT: sbbq %rax, %r12 +; X64-NEXT: sbbq %rax, %rbp +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill +; X64-NEXT: cmovnsq %rsi, %rbp +; X64-NEXT: cmovnsq %r8, %r12 +; X64-NEXT: cmovnsq %r11, %rbx +; X64-NEXT: cmovnsq %rdx, %rdi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %rbp -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: imulq %rcx, %r8 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rax, %rsi -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload -; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: addq %rcx, %r9 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: adcq %r9, %rsi +; X64-NEXT: imulq %r14, %rbx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload +; X64-NEXT: addq %rbx, %r9 +; X64-NEXT: addq %rdi, %r9 +; X64-NEXT: addq %r11, %rcx +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: addq %r11, %rbx +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r11 +; 
X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %r15d +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r11, %rbx +; X64-NEXT: adcq %r15, %rsi +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload +; X64-NEXT: imulq %r14, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %r12, %rbp +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %rsi, %rbp +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %r9, %r10 +; X64-NEXT: adcq $0, %rax +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload +; X64-NEXT: movzbl %cl, %edx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: negq %rcx +; X64-NEXT: xorq %rcx, %r15 +; X64-NEXT: xorq %rcx, %r14 +; X64-NEXT: addq %rdx, %r14 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: cmpq %rdx, %r14 +; X64-NEXT: movq %r15, %rdx +; X64-NEXT: sbbq $0, %rdx +; X64-NEXT: setb %dl +; X64-NEXT: movzbl %dl, %edx +; X64-NEXT: xorq %rcx, %r10 +; X64-NEXT: xorq %rcx, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: xorq %rcx, %rbp +; X64-NEXT: xorq %rax, %rcx +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: sbbq $0, %rax +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: orq %rcx, %rbp +; X64-NEXT: LBB1_7: ## %overflow.res +; X64-NEXT: setne %al +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; X64-NEXT: jmp LBB1_8 +; X64-NEXT: LBB1_5: ## %overflow.no +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: imulq %r11, %rcx ; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rdi, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: imulq %r15, %r12 +; X64-NEXT: addq %rdx, %r12 +; X64-NEXT: imulq %r9, %r8 +; X64-NEXT: addq %r12, %r8 +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: adcq %rbx, %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: addq %r9, %r14 -; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: adcq %r10, %rsi +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %ebx +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r15, %rdi -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: setb %cl -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: addq %r8, %rax -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r14, %rax -; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq %r13, 
%rsi -; X64-NEXT: adcq %r11, %rdi -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload -; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload -; X64-NEXT: movq %r8, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rcx, %rsi -; X64-NEXT: orq %rax, %rsi -; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: xorq %rdi, %rcx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: orq %rsi, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %r8, 24(%rax) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload -; X64-NEXT: movq %rcx, (%rax) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload -; X64-NEXT: movq %rcx, 8(%rax) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload -; X64-NEXT: movq %rcx, 16(%rax) -; X64-NEXT: setne %al +; X64-NEXT: addq %rsi, %rdi +; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %r8, %r10 +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: LBB1_8: ## %overflow.res +; X64-NEXT: movq %r14, (%r13) +; X64-NEXT: movq %r15, 8(%r13) +; X64-NEXT: movq %rdi, 16(%r13) +; X64-NEXT: movq %r10, 24(%r13) +; X64-NEXT: andb $1, %al +; X64-NEXT: ## kill: def $al killed $al killed $eax ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -514,7 +1407,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: retq ; ; X86-LABEL: smuloi256: -; X86: ## %bb.0: +; X86: ## %bb.0: ## %overflow.entry ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -529,334 +1422,1687 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: je LBB1_12 +; X86-NEXT: ## %bb.1: ## %overflow.lhs +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: je LBB1_2 +; X86-NEXT: ## %bb.14: ## %overflow +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; 
X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: setb %bl +; X86-NEXT: movl %eax, %esi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx 
+; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded 
Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte 
Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edx, %edi +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded 
Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; 
X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: imull %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded 
Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: xorl %edx, %esi +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: jmp LBB1_15 +; X86-NEXT: LBB1_12: ## %overflow.no.lhs +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: je LBB1_13 +; X86-NEXT: ## %bb.7: ## %overflow.no.lhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB1_9 +; X86-NEXT: ## %bb.8: ## %overflow.no.lhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: LBB1_9: ## %overflow.no.lhs.only +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB1_11 +; X86-NEXT: ## %bb.10: ## %overflow.no.lhs.only +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: LBB1_11: ## %overflow.no.lhs.only +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl 
%eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: 
adcl $0, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl (%esp), %ebp ## 4-byte Reload +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: 
imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: addl (%esp), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: imull %esi, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: imull %ebx, %ebp +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte 
Reload +; X86-NEXT: imull %ecx, %edx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %esi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: 
adcl $0, %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: movl (%esp), %ebp ## 4-byte Reload +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: cmpl %esi, %ebp +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jmp LBB1_15 +; X86-NEXT: LBB1_2: ## %overflow.no.rhs.only ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: 
sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: js LBB1_4 +; X86-NEXT: ## %bb.3: ## %overflow.no.rhs.only ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: LBB1_4: ## %overflow.no.rhs.only ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: xorl %eax, %ebx 
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: js LBB1_6 +; X86-NEXT: ## %bb.5: ## %overflow.no.rhs.only ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte 
Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: LBB1_6: ## %overflow.no.rhs.only ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %edi ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: 
movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %eax +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl %ebx, %eax @@ -866,203 +3112,277 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl 
%ebx, %edi +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: mull %ebp -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: imull %ebp, %eax 
+; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: mull %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %edx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sarl $31, %edx +; X86-NEXT: imull %ecx, %edx +; X86-NEXT: addl %edx, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload +; X86-NEXT: movzbl %cl, %ebx +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl $0, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: orl %eax, %edi +; X86-NEXT: 
orl %ebp, %edi +; X86-NEXT: LBB1_15: ## %overflow.res +; X86-NEXT: setne %al +; X86-NEXT: jmp LBB1_16 +; X86-NEXT: LBB1_13: ## %overflow.no +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi @@ -1073,58 +3393,60 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx ; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: 
addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx @@ -1133,15 +3455,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1150,136 +3472,151 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ecx, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: 
addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ebp, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: LBB1_16: ## %overflow.res +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: xorl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: orl %ebp, %ecx -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: setne %al +; X86-NEXT: movl %edx, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %esi, 16(%ecx) +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, 20(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, 28(%ecx) +; X86-NEXT: andb $1, %al +; X86-NEXT: ## kill: def $al killed $al killed $eax ; X86-NEXT: addl $128, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 89afd1b00444b..4ccb90a37ca71 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -4,14 +4,19 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X64-LABEL: muloti_test: -; X64: # %bb.0: # %start +; X64: # %bb.0: # %overflow.entry ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: je .LBB0_3 +; X64-NEXT: # %bb.1: # %overflow.lhs ; X64-NEXT: testq %rcx, %rcx -; 
X64-NEXT: setne %dl +; X64-NEXT: je .LBB0_7 +; X64-NEXT: # %bb.2: # %overflow +; X64-NEXT: setne %al ; X64-NEXT: testq %rsi, %rsi ; X64-NEXT: setne %r9b -; X64-NEXT: andb %dl, %r9b +; X64-NEXT: andb %al, %r9b +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: seto %r10b @@ -26,10 +31,59 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: setb %cl ; X64-NEXT: orb %r11b, %cl +; X64-NEXT: andb $1, %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: retq +; X64-NEXT: .LBB0_3: # %overflow.no.lhs +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: testq %rcx, %rcx +; X64-NEXT: je .LBB0_8 +; X64-NEXT: # %bb.4: # %overflow.no.lhs.only +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: imulq %rcx, %rsi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: jmp .LBB0_5 +; X64-NEXT: .LBB0_7: # %overflow.no.rhs.only +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: imulq %rcx, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: imulq %rsi, %rcx +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: .LBB0_5: # %overflow.no.lhs.only +; X64-NEXT: setne %cl +; X64-NEXT: andb $1, %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: retq +; X64-NEXT: .LBB0_8: # %overflow.no +; X64-NEXT: imulq %rcx, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: imulq %r8, %rsi +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: andb $1, %cl +; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: retq ; ; X86-LABEL: muloti_test: -; X86: # %bb.0: # %start +; X86: # %bb.0: # %overflow.entry ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -38,116 +92,352 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $28, %esp -; X86-NEXT: .cfi_def_cfa_offset 48 +; X86-NEXT: subl $36, %esp +; X86-NEXT: .cfi_def_cfa_offset 56 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: je .LBB0_4 +; X86-NEXT: # %bb.1: # %overflow.lhs +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.6: # %overflow +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: mull 
%esi +; X86-NEXT: leal (%edi,%eax), %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: leal (%ecx,%eax), %esi -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: leal (%esi,%eax), %esi +; X86-NEXT: mull %ebp +; X86-NEXT: leal (%ebx,%eax), %ebx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: setne %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: setne %ah +; X86-NEXT: andb %al, %ah ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx 
-; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl %ebx, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setne %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setne %ch -; X86-NEXT: andb %cl, %ch -; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: orb %ch, %cl +; X86-NEXT: orb %ah, %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: testl %edi, %edi -; X86-NEXT: setne %cl +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: setne %al ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi +; X86-NEXT: setne %ah +; X86-NEXT: andb %al, %ah +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: orb %ah, %al +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: setne %ch -; X86-NEXT: andb %cl, %ch -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; X86-NEXT: orb %ch, %bl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: setne %bh ; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: setne %bl +; X86-NEXT: andb %ch, %bl +; X86-NEXT: orb %al, %bl +; X86-NEXT: orb %cl, %bl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_4: # %overflow.no.lhs +; X86-NEXT: movl %eax, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: je .LBB0_5 +; X86-NEXT: # %bb.3: # %overflow.no.lhs.only +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %eax, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: setne %bl +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_2: # %overflow.no.rhs.only +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl $0, %eax +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: setne %bl +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_5: # %overflow.no +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %edi, %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: imull %esi, %ebx +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: imull %edx, %ebp +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 4(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %eax, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: setne %al -; X86-NEXT: andb %bh, %al -; X86-NEXT: orb %bl, %al -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: andb $1, %al -; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $28, %esp +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: .LBB0_7: # %overflow.res +; X86-NEXT: andb $1, %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: movb %bl, 16(%eax) +; X86-NEXT: addl $36, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git 
a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll index 132683cdb0f9e..99dc422a6b53e 100644 --- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -3,7 +3,7 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-LABEL: mulodi_test: -; X86: # %bb.0: # %start +; X86: # %bb.0: # %overflow.entry ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -12,32 +12,89 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: pushl %eax +; X86-NEXT: .cfi_def_cfa_offset 24 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %esi, %esi -; X86-NEXT: setne %dl -; X86-NEXT: testl %eax, %eax +; X86-NEXT: testl %edi, %edi +; X86-NEXT: je .LBB0_4 +; X86-NEXT: # %bb.1: # %overflow.lhs +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: je .LBB0_2 +; X86-NEXT: # %bb.6: # %overflow +; X86-NEXT: setne %al +; X86-NEXT: testl %edi, %edi ; X86-NEXT: setne %cl -; X86-NEXT: andb %dl, %cl -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: andb %al, %cl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %edi +; X86-NEXT: seto %ch +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi ; X86-NEXT: seto %bl +; X86-NEXT: orb %ch, %bl +; X86-NEXT: orb %cl, %bl +; X86-NEXT: leal (%edi,%eax), %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: seto %ch -; X86-NEXT: orb %bl, %ch -; X86-NEXT: orb %cl, %ch -; X86-NEXT: leal (%edi,%eax), %esi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: setb %cl +; X86-NEXT: orb %bl, %cl +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_4: # %overflow.no.lhs +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: je .LBB0_5 +; X86-NEXT: # %bb.3: # %overflow.no.lhs.only +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull %edi, %ebp +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setne %cl +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_2: # %overflow.no.rhs.only +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: imull %edi, %ebx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-NEXT: addl %esi, %edx -; X86-NEXT: setb %cl -; X86-NEXT: orb %ch, %cl +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setne %cl +; X86-NEXT: jmp .LBB0_7 +; X86-NEXT: .LBB0_5: # %overflow.no +; X86-NEXT: imull %ebx, %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: movl %edi, %edx +; X86-NEXT: .LBB0_7: # %overflow.res +; X86-NEXT: andb $1, %cl +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: addl $4, %esp +; 
X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index a076d0d762aa3..2601b73f26822 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -13,7 +13,7 @@ define {i64, i1} @t1() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t1: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: movl $72, %eax ; WIN32-NEXT: xorl %edx, %edx ; WIN32-NEXT: xorl %ecx, %ecx @@ -30,7 +30,7 @@ define {i64, i1} @t2() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t2: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: xorl %eax, %eax ; WIN32-NEXT: xorl %edx, %edx ; WIN32-NEXT: xorl %ecx, %ecx @@ -47,7 +47,7 @@ define {i64, i1} @t3() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t3: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: movl $-9, %eax ; WIN32-NEXT: movl $-1, %edx ; WIN32-NEXT: movb $1, %cl @@ -204,59 +204,207 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: sarl $31, %edx ; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: subl %edx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebx, %edx +; WIN32-NEXT: je LBB6_13 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB6_2 +; WIN32-NEXT: # %bb.15: # %overflow +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull %eax, %esi -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: adcl %esi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebp, %edi 
+; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; WIN32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: imull %ebx ; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; WIN32-NEXT: movl %ecx, (%eax) +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: jmp LBB6_16 +; WIN32-NEXT: LBB6_13: # %overflow.no.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB6_14 +; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebp +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB6_9 +; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: LBB6_9: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill +; WIN32-NEXT: js LBB6_11 +; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: LBB6_11: # %overflow.no.lhs.only +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ecx, %ebp +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; WIN32-NEXT: jmp LBB6_12 +; WIN32-NEXT: LBB6_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ebp +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB6_4 +; WIN32-NEXT: # %bb.3: # 
%overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: LBB6_4: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %ebx +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %edi +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill +; WIN32-NEXT: js LBB6_6 +; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: LBB6_6: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %ebp, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %edi, %ebp +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload +; WIN32-NEXT: LBB6_12: # %overflow.res +; WIN32-NEXT: movzbl %al, %esi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: negl %eax +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: xorl %ebx, %ebx +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: subl %esi, %edi +; WIN32-NEXT: setb %bl +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: xorl %edx, %eax +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: subl %ebx, %edx +; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: setne %al +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: jmp LBB6_16 +; WIN32-NEXT: LBB6_14: # %overflow.no +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: mull %edx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %edi, %ebx +; WIN32-NEXT: addl %edx, %ebx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: LBB6_16: # %overflow.res +; WIN32-NEXT: movl %ebp, (%esi) +; WIN32-NEXT: movl %ecx, 4(%esi) +; WIN32-NEXT: andb $1, %al +; WIN32-NEXT: # kill: def $al killed $al killed $eax +; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -449,37 +597,93 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi +; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: je LBB10_5 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB10_2 +; WIN32-NEXT: # %bb.7: # %overflow +; WIN32-NEXT: setne %al +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: andb %al, %cl +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, (%esp) # 
4-byte Spill ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %ch, %cl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %edx, 4(%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload +; WIN32-NEXT: leal (%edx,%eax), %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: setb %dl +; WIN32-NEXT: orb %ch, %dl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: jmp LBB10_8 +; WIN32-NEXT: LBB10_5: # %overflow.no.lhs +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB10_6 +; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebp, %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: jmp LBB10_3 +; WIN32-NEXT: LBB10_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ebx, %ebp +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: LBB10_3: # %overflow.res +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %dl +; WIN32-NEXT: jmp LBB10_8 +; WIN32-NEXT: LBB10_6: # %overflow.no +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %esi, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %edi, %ebx +; WIN32-NEXT: xorl %edx, %edx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: LBB10_8: # %overflow.res +; WIN32-NEXT: movl %esi, (%ecx) +; WIN32-NEXT: movl %eax, 4(%ecx) +; WIN32-NEXT: andb $1, %dl +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -547,75 +751,224 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloselecti64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: subl %ecx, %edx +; WIN32-NEXT: je LBB12_13 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: subl 
%esi, %ecx +; WIN32-NEXT: je LBB12_2 +; WIN32-NEXT: # %bb.15: # %overflow +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: imull %esi, %ecx +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: adcl %esi, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: adcl %ecx, %ebx ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %ebp, %edi -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: adcl %ecx, %ebp +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: addl %ebx, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload -; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull %ebx -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: adcl %esi, %edx ; WIN32-NEXT: sarl $31, %edi ; WIN32-NEXT: xorl %edi, %edx ; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: orl %edx, %edi -; WIN32-NEXT: jne LBB12_2 -; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: setne %cl +; WIN32-NEXT: testb $1, %cl +; WIN32-NEXT: je LBB12_17 +; WIN32-NEXT: jmp LBB12_18 +; WIN32-NEXT: LBB12_13: # %overflow.no.lhs +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: je LBB12_14 +; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: xorl %ecx, %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: xorl %ecx, %edi +; WIN32-NEXT: subl %ecx, %edi +; WIN32-NEXT: sbbl %ecx, %esi +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB12_10 +; WIN32-NEXT: # %bb.9: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: LBB12_10: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: subl %eax, %ebp +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB12_12 +; WIN32-NEXT: # %bb.11: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; 
WIN32-NEXT: LBB12_12: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: addl %edx, %ebp +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: adcl %esi, %edi +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; WIN32-NEXT: movzbl %cl, %esi +; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: negl %ecx +; WIN32-NEXT: xorl %ecx, %ebx +; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: xorl %edx, %edx +; WIN32-NEXT: subl %esi, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: setb %dl +; WIN32-NEXT: xorl %ecx, %eax +; WIN32-NEXT: addl %edx, %eax +; WIN32-NEXT: xorl %edi, %ecx +; WIN32-NEXT: subl %edx, %eax +; WIN32-NEXT: adcl $0, %ecx +; WIN32-NEXT: setne %cl +; WIN32-NEXT: jmp LBB12_7 +; WIN32-NEXT: LBB12_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: LBB12_2: -; WIN32-NEXT: movl %esi, %edx -; WIN32-NEXT: addl $4, %esp -; WIN32-NEXT: popl %esi -; WIN32-NEXT: popl %edi -; WIN32-NEXT: popl %ebx -; WIN32-NEXT: popl %ebp -; WIN32-NEXT: retl - %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) - %obit = extractvalue {i64, i1} %t, 1 - %ret = select i1 %obit, i64 %v1, i64 %v2 - ret i64 %ret -} - -define i32 @umuloselecti32(i32 %v1, i32 %v2) { -; LINUX-LABEL: umuloselecti32: -; LINUX: # %bb.0: +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %esi +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB12_4 +; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: LBB12_4: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: subl %eax, %ebp +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB12_6 +; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: LBB12_6: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: addl %edx, %ebp +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: adcl %esi, %edi +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; WIN32-NEXT: movzbl %cl, %esi +; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: negl %ecx +; WIN32-NEXT: xorl %ecx, %ebx +; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: xorl %edx, %edx +; WIN32-NEXT: subl %esi, %ebx +; WIN32-NEXT: setb %dl +; WIN32-NEXT: xorl %ecx, %eax +; WIN32-NEXT: addl %edx, %eax +; WIN32-NEXT: xorl %edi, %ecx +; WIN32-NEXT: subl %edx, %eax +; WIN32-NEXT: adcl $0, %ecx +; 
WIN32-NEXT: setne %cl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: LBB12_7: # %overflow.res +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: testb $1, %cl +; WIN32-NEXT: jne LBB12_18 +; WIN32-NEXT: LBB12_17: # %overflow.res +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: LBB12_18: # %overflow.res +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: popl %esi +; WIN32-NEXT: popl %edi +; WIN32-NEXT: popl %ebx +; WIN32-NEXT: popl %ebp +; WIN32-NEXT: retl +; WIN32-NEXT: LBB12_14: # %overflow.no +; WIN32-NEXT: xorl %ecx, %ecx +; WIN32-NEXT: testb $1, %cl +; WIN32-NEXT: je LBB12_17 +; WIN32-NEXT: jmp LBB12_18 + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @umuloselecti32(i32 %v1, i32 %v2) { +; LINUX-LABEL: umuloselecti32: +; LINUX: # %bb.0: ; LINUX-NEXT: movl %edi, %eax ; LINUX-NEXT: mull %esi ; LINUX-NEXT: cmovol %edi, %esi @@ -670,45 +1023,86 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloselecti64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: je LBB14_5 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: testl %edi, %edi +; WIN32-NEXT: je LBB14_2 +; WIN32-NEXT: # %bb.7: # %overflow ; WIN32-NEXT: setne %al ; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %bl -; WIN32-NEXT: andb %al, %bl +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %al, %cl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: seto %bh -; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; WIN32-NEXT: orb %bl, %bh -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload +; WIN32-NEXT: orb %cl, %ch +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: mull %edx +; WIN32-NEXT: addl %ebp, %edx ; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %bh, %al -; WIN32-NEXT: testb %al, %al -; WIN32-NEXT: jne LBB14_2 -; WIN32-NEXT: # %bb.1: +; WIN32-NEXT: orb %ch, %al +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: je LBB14_9 +; WIN32-NEXT: jmp LBB14_10 +; WIN32-NEXT: LBB14_5: # %overflow.no.lhs +; WIN32-NEXT: testl %edi, %edi +; WIN32-NEXT: je LBB14_6 +; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %ebp, %edi ; WIN32-NEXT: movl 
%ebp, %ecx +; WIN32-NEXT: imull %esi, %ecx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: movl %esi, %ebp +; WIN32-NEXT: imull %eax, %ebp +; WIN32-NEXT: movl %eax, %edx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %edx +; WIN32-NEXT: jmp LBB14_3 +; WIN32-NEXT: LBB14_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %edi, %ebp +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: mull %esi +; WIN32-NEXT: LBB14_3: # %overflow.res +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %al +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: jne LBB14_10 +; WIN32-NEXT: LBB14_9: # %overflow.res +; WIN32-NEXT: movl %edi, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: LBB14_2: -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: LBB14_10: # %overflow.res +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: movl %esi, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi @@ -716,6 +1110,12 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl +; WIN32-NEXT: LBB14_6: # %overflow.no +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: je LBB14_9 +; WIN32-NEXT: jmp LBB14_10 %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -952,35 +1352,47 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smulobri64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull %edi, %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: sarl $31, %edx +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: subl %edx, %esi +; WIN32-NEXT: je LBB18_12 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB18_2 +; WIN32-NEXT: # %bb.14: # %overflow1 +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %ebx, %ebp -; WIN32-NEXT: adcl %ecx, %edi +; WIN32-NEXT: adcl %esi, %edi ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %ecx, %edx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull %esi, %ecx -; WIN32-NEXT: movl %esi, %eax +; 
WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %eax, %ecx ; WIN32-NEXT: mull %edx ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %esi @@ -989,7 +1401,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %ebx, %ebp ; WIN32-NEXT: sarl $31, %ebp ; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; WIN32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %ebx, %eax @@ -998,19 +1410,148 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: xorl %esi, %edx ; WIN32-NEXT: xorl %eax, %esi ; WIN32-NEXT: orl %edx, %esi -; WIN32-NEXT: jne LBB18_1 -; WIN32-NEXT: # %bb.3: # %continue +; WIN32-NEXT: jmp LBB18_15 +; WIN32-NEXT: LBB18_12: # %overflow.no.lhs +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB18_13 +; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %edi, %ebx +; WIN32-NEXT: xorl %eax, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ebx +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB18_9 +; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: LBB18_9: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebp +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB18_11 +; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: LBB18_11: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebp, %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; WIN32-NEXT: movzbl %bl, %edi +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: negl %esi +; WIN32-NEXT: xorl %esi, %ecx +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: xorl %ebx, %ebx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: setb %bl +; WIN32-NEXT: xorl %esi, %eax +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: xorl %edx, %esi +; WIN32-NEXT: subl %ebx, %eax +; WIN32-NEXT: adcl $0, %esi +; WIN32-NEXT: jmp LBB18_15 +; WIN32-NEXT: LBB18_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebp, %ebx +; WIN32-NEXT: xorl %eax, %ebx +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ebx +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: js LBB18_4 +; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only +; WIN32-NEXT: movl 
%ebp, %ebx +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: LBB18_4: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl %esi, %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: subl %eax, %ebp +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB18_6 +; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %esi, %ebp +; WIN32-NEXT: LBB18_6: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: imull %ebx, %ebp +; WIN32-NEXT: addl %edx, %ebp +; WIN32-NEXT: imull %ecx, %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload +; WIN32-NEXT: movzbl %cl, %edi +; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: negl %ecx +; WIN32-NEXT: xorl %ecx, %esi +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: xorl %ebx, %ebx +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: setb %bl +; WIN32-NEXT: xorl %ecx, %eax +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: xorl %edx, %ecx +; WIN32-NEXT: subl %ebx, %eax +; WIN32-NEXT: adcl $0, %ecx +; WIN32-NEXT: LBB18_15: # %overflow.res +; WIN32-NEXT: setne %al +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: jne LBB18_17 +; WIN32-NEXT: LBB18_19: # %continue ; WIN32-NEXT: movb $1, %al -; WIN32-NEXT: LBB18_2: # %overflow -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: LBB18_18: # %overflow +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB18_1: # %overflow +; WIN32-NEXT: LBB18_13: # %overflow.no ; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: jmp LBB18_2 +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: je LBB18_19 +; WIN32-NEXT: LBB18_17: # %overflow +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: jmp LBB18_18 %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1261,46 +1802,90 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umulobri64: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: je LBB22_5 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: testl %edi, %edi +; WIN32-NEXT: je LBB22_2 +; WIN32-NEXT: # %bb.7: # %overflow1 +; WIN32-NEXT: setne %al +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %eax, %eax -; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: andb %al, %dl +; WIN32-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: seto %bl +; 
WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: seto %bh +; WIN32-NEXT: orb %bl, %bh +; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; WIN32-NEXT: leal (%ebp,%eax), %edi ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: seto %ch -; WIN32-NEXT: orb %bl, %ch -; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: addl %edi, %edx ; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %ch, %al -; WIN32-NEXT: subb $1, %al -; WIN32-NEXT: je LBB22_1 -; WIN32-NEXT: # %bb.3: # %continue +; WIN32-NEXT: orb %bh, %al +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: je LBB22_11 +; WIN32-NEXT: jmp LBB22_9 +; WIN32-NEXT: LBB22_5: # %overflow.no.lhs +; WIN32-NEXT: testl %edi, %edi +; WIN32-NEXT: je LBB22_6 +; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: imull %edi, %ebx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: jmp LBB22_3 +; WIN32-NEXT: LBB22_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: LBB22_3: # %overflow.res +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %al +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: jne LBB22_9 +; WIN32-NEXT: LBB22_11: # %continue ; WIN32-NEXT: movb $1, %al -; WIN32-NEXT: LBB22_2: # %overflow +; WIN32-NEXT: LBB22_10: # %overflow +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB22_1: # %overflow +; WIN32-NEXT: LBB22_6: # %overflow.no +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: testb $1, %al +; WIN32-NEXT: je LBB22_11 +; WIN32-NEXT: LBB22_9: # %overflow ; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: jmp LBB22_2 +; WIN32-NEXT: jmp LBB22_10 %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1334,18 +1919,33 @@ define i1 @bug27873(i64 %c1, i1 %c2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: bug27873: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebx -; WIN32-NEXT: movl $160, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl $160, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ecx, %edx -; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %bl, %al -; WIN32-NEXT: orb {{[0-9]+}}(%esp), %al +; WIN32-NEXT: pushl %edi +; WIN32-NEXT: pushl %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: je LBB23_2 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl $160, %ebx +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: adcl $0, %edx +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %al +; WIN32-NEXT: jmp LBB23_3 +; WIN32-NEXT: LBB23_2: # %overflow.no.lhs +; WIN32-NEXT: xorl 
%eax, %eax +; WIN32-NEXT: LBB23_3: # %overflow.res +; WIN32-NEXT: orb %al, %cl +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: popl %esi +; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: retl %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160) @@ -1635,62 +2235,208 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64_load: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ecx -; WIN32-NEXT: movl 4(%eax), %ebp -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl (%eax), %edi +; WIN32-NEXT: movl 4(%eax), %ecx +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: sarl $31, %edx +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: subl %edx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebx, %edx +; WIN32-NEXT: je LBB30_13 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB30_2 +; WIN32-NEXT: # %bb.15: # %overflow +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: imull %esi, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: mull %esi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %edi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: adcl %esi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ebp, %edi ; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; WIN32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: imull %ebx ; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; WIN32-NEXT: imull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; WIN32-NEXT: movl %ecx, (%eax) +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: jmp LBB30_16 +; WIN32-NEXT: LBB30_13: # %overflow.no.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB30_14 +; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebp +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB30_9 +; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: LBB30_9: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill +; WIN32-NEXT: js LBB30_11 +; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: LBB30_11: # %overflow.no.lhs.only +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ecx, %ebp +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; WIN32-NEXT: jmp LBB30_12 +; WIN32-NEXT: LBB30_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebp +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB30_4 +; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebx, %ebp +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: LBB30_4: # %overflow.no.rhs.only +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: xorl %eax, %ebx +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebx +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets (%esp) # 
1-byte Folded Spill +; WIN32-NEXT: js LBB30_6 +; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: LBB30_6: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull %ebp, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebx, %ebp +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl %edi, %ebp +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload +; WIN32-NEXT: LBB30_12: # %overflow.res +; WIN32-NEXT: movzbl %al, %esi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: negl %eax +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: xorl %ebx, %ebx +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: subl %esi, %edi +; WIN32-NEXT: setb %bl +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: xorl %edx, %eax +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: subl %ebx, %edx +; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: setne %al +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: jmp LBB30_16 +; WIN32-NEXT: LBB30_14: # %overflow.no +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: mull %edx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %edi, %ebx +; WIN32-NEXT: addl %edx, %ebx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: LBB30_16: # %overflow.res +; WIN32-NEXT: movl %ebp, (%esi) +; WIN32-NEXT: movl %ecx, 4(%esi) +; WIN32-NEXT: andb $1, %al +; WIN32-NEXT: # kill: def $al killed $al killed $eax +; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1728,61 +2474,206 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64_load2: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl (%ecx), %ebx -; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl (%edx), %ebx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: subl %esi, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl 4(%edx), %ebp +; WIN32-NEXT: movl %ebp, %edx +; WIN32-NEXT: je LBB31_13 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB31_2 +; WIN32-NEXT: # %bb.15: # %overflow +; WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebx ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl 4(%eax), %ecx -; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ebp, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull (%esp) # 4-byte Folded Reload -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: adcl %esi, %edi +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: addl %ebx, %edi +; WIN32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: imull %ebp +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; WIN32-NEXT: movl %ecx, (%eax) +; WIN32-NEXT: movl %edi, %ecx ; WIN32-NEXT: setne %al +; WIN32-NEXT: jmp LBB31_16 +; WIN32-NEXT: LBB31_13: # %overflow.no.lhs +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: je LBB31_14 +; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %esi +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB31_10 +; WIN32-NEXT: # %bb.9: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: LBB31_10: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: subl %eax, %edi +; WIN32-NEXT: sbbl %eax, %ecx +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill +; WIN32-NEXT: js LBB31_12 +; WIN32-NEXT: # %bb.11: # %overflow.no.lhs.only +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: LBB31_12: # %overflow.no.lhs.only +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl 
%edx, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %esi, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ecx, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %edi, %ecx +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; WIN32-NEXT: jmp LBB31_7 +; WIN32-NEXT: LBB31_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: movl %ebx, %edx +; WIN32-NEXT: xorl %eax, %edx +; WIN32-NEXT: subl %eax, %edx +; WIN32-NEXT: sbbl %eax, %edi +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: js LBB31_4 +; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: movl %ebx, %edx +; WIN32-NEXT: LBB31_4: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: xorl %eax, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: xorl %eax, %esi +; WIN32-NEXT: subl %eax, %esi +; WIN32-NEXT: sbbl %eax, %ebx +; WIN32-NEXT: testl %ecx, %ecx +; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill +; WIN32-NEXT: js LBB31_6 +; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: LBB31_6: # %overflow.no.rhs.only +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload +; WIN32-NEXT: LBB31_7: # %overflow.res +; WIN32-NEXT: movzbl %al, %esi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: negl %eax +; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: xorl %ebx, %ebx +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: subl %esi, %edi +; WIN32-NEXT: setb %bl +; WIN32-NEXT: xorl %eax, %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: xorl %edx, %eax +; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: subl %ebx, %edx +; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: setne %al +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: jmp LBB31_16 +; WIN32-NEXT: LBB31_14: # %overflow.no +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: imull %edi, %ebp +; WIN32-NEXT: addl %edx, %ebp +; WIN32-NEXT: imull %ebx, %ecx +; WIN32-NEXT: addl %ebp, %ecx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: xorl %eax, %eax +; WIN32-NEXT: LBB31_16: # %overflow.res +; WIN32-NEXT: movl %ebp, (%esi) +; WIN32-NEXT: movl %ecx, 4(%esi) +; WIN32-NEXT: andb $1, %al +; WIN32-NEXT: # kill: def $al killed $al killed $eax ; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -2133,38 +3024,94 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64_load: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; 
WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebp -; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: movl (%eax), %edi +; WIN32-NEXT: movl 4(%eax), %ebx +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: je LBB38_5 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB38_2 +; WIN32-NEXT: # %bb.7: # %overflow +; WIN32-NEXT: setne %al +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: andb %al, %cl +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %ch, %cl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %edx, 4(%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload +; WIN32-NEXT: leal (%edx,%eax), %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: setb %dl +; WIN32-NEXT: orb %ch, %dl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: jmp LBB38_8 +; WIN32-NEXT: LBB38_5: # %overflow.no.lhs +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB38_6 +; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebp, %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: jmp LBB38_3 +; WIN32-NEXT: LBB38_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ebx, %ebp +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: LBB38_3: # %overflow.res +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %dl +; WIN32-NEXT: jmp LBB38_8 +; WIN32-NEXT: LBB38_6: # %overflow.no +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %esi, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %edi, %ebx +; WIN32-NEXT: xorl %edx, %edx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: LBB38_8: # %overflow.res +; WIN32-NEXT: movl %esi, (%ecx) +; WIN32-NEXT: movl %eax, 4(%ecx) +; WIN32-NEXT: andb $1, %dl +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2210,38 +3157,94 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, 
ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64_load2: -; WIN32: # %bb.0: +; WIN32: # %bb.0: # %overflow.entry ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl (%ecx), %ebp -; WIN32-NEXT: movl 4(%ecx), %esi -; WIN32-NEXT: testl %eax, %eax -; WIN32-NEXT: setne %dl -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl (%eax), %edi +; WIN32-NEXT: movl 4(%eax), %ebp +; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: je LBB39_5 +; WIN32-NEXT: # %bb.1: # %overflow.lhs +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB39_2 +; WIN32-NEXT: # %bb.7: # %overflow +; WIN32-NEXT: setne %al +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %dl, %cl -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: andb %al, %cl +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %esi ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload +; WIN32-NEXT: leal (%edx,%eax), %ebx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl %edx, %eax +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: setb %dl +; WIN32-NEXT: orb %ch, %dl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: jmp LBB39_8 +; WIN32-NEXT: LBB39_5: # %overflow.no.lhs +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi +; WIN32-NEXT: testl %ebp, %ebp +; WIN32-NEXT: je LBB39_6 +; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: imull %ebp, %ebx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %ch, %cl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %edx, 4(%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: jmp LBB39_3 +; WIN32-NEXT: LBB39_2: # %overflow.no.rhs.only +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: imull %ebp, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %ebx, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: LBB39_3: # %overflow.res +; WIN32-NEXT: testl %edx, %edx +; WIN32-NEXT: setne %dl +; WIN32-NEXT: jmp LBB39_8 +; WIN32-NEXT: LBB39_6: # %overflow.no +; WIN32-NEXT: imull %ebp, %esi +; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: imull %edi, %ebx +; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: xorl %edx, %edx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: LBB39_8: # %overflow.res +; WIN32-NEXT: movl %esi, (%ecx) +; WIN32-NEXT: movl %eax, 4(%ecx) +; WIN32-NEXT: andb $1, %dl +; WIN32-NEXT: movl 
%edx, %eax +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx From f8b60eba57876195092fe90af028fc7f08c6a68b Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 1 Aug 2025 17:44:14 +0000 Subject: [PATCH 02/12] Resolve review comments: - Enable optimization for AArch64 only. - Optimize only when both LHS and RHS value ranges are within the legal type. - Use a single Builder Change-Id: I11d674440364594e4bca839495036975cd403aa5 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 506 +- llvm/test/CodeGen/AArch64/i128-math.ll | 449 +- .../CodeGen/AArch64/i128_with_overflow.ll | 171 +- .../umulo-128-legalisation-lowering.ll | 145 +- .../ARM/umulo-128-legalisation-lowering.ll | 579 +-- .../ARM/umulo-64-legalisation-lowering.ll | 107 +- .../CodeGen/LoongArch/smul-with-overflow.ll | 985 +--- .../umulo-128-legalisation-lowering.ll | 439 +- .../RISCV/umulo-128-legalisation-lowering.ll | 355 +- llvm/test/CodeGen/RISCV/xaluo.ll | 2857 ++---------- .../SPARC/smulo-128-legalisation-lowering.ll | 1255 +---- .../SPARC/umulo-128-legalisation-lowering.ll | 605 +-- .../Thumb/umulo-128-legalisation-lowering.ll | 654 +-- .../Thumb2/umulo-128-legalisation-lowering.ll | 294 +- .../Thumb2/umulo-64-legalisation-lowering.ll | 51 +- llvm/test/CodeGen/X86/muloti.ll | 177 +- .../X86/smulo-128-legalisation-lowering.ll | 4101 ++++------------- .../X86/umulo-128-legalisation-lowering.ll | 454 +- .../X86/umulo-64-legalisation-lowering.ll | 85 +- llvm/test/CodeGen/X86/xmulo.ll | 1597 ++----- 20 files changed, 3026 insertions(+), 12840 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 238718e471e47..9db998f24482c 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6395,17 +6395,21 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, return true; } -// Rewrite the umul_with_overflow intrinsic by checking if any/both of the +// Rewrite the umul_with_overflow intrinsic by checking if both of the // operands' value range is within the legal type. If so, we can optimize the // multiplication algorithm. This code is supposed to be written during the step // of type legalization, but given that we need to reconstruct the IR which is // not doable there, we do it here. bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { + // Enable this optimization only for aarch64. + if (!TLI->getTargetMachine().getTargetTriple().isAArch64()) + return false; if (TLI->getTypeAction( I->getContext(), TLI->getValueType(*DL, I->getType()->getContainedType(0))) != TargetLowering::TypeExpandInteger) return false; + Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); auto *Ty = LHS->getType(); @@ -6413,206 +6417,63 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - assert( - (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) == - TargetLowering::TypeLegal) && - "Expected the type to be legal for the target lowering"); + // Skip the optimization if the type with HalfBitWidth is not legal for the target. 
+ if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) + return false; I->getParent()->setName("overflow.res"); auto *OverflowResBB = I->getParent(); auto *OverflowoEntryBB = I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); - BasicBlock *OverflowLHSBB = BasicBlock::Create( - I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowLHSBB = BasicBlock::Create( - I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create( - I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create( - I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB); BasicBlock *NoOverflowBB = BasicBlock::Create( I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", I->getFunction(), OverflowResBB); // new blocks should be: // entry: - // lhs_lo ne lhs_hi ? overflow_yes_lhs, overflow_no_lhs + // (lhs_lo ne lhs_hi) || (rhs_lo ne rhs_hi) ? overflow, overflow_no - // overflow_yes_lhs: - // rhs_lo ne rhs_hi ? overflow : overflow_no_rhs_only - - // overflow_no_lhs: - // rhs_lo ne rhs_hi ? overflow_no_lhs_only : overflow_no - - // overflow_no_rhs_only: - // overflow_no_lhs_only: // overflow_no: // overflow: // overflow.res: - - IRBuilder<> BuilderEntryBB(OverflowoEntryBB->getTerminator()); - IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB); - IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB); - IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB); - IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB); - IRBuilder<> BuilderNoOverflowBB(NoOverflowBB); - IRBuilder<> BuilderOverflowResBB(OverflowResBB, - OverflowResBB->getFirstInsertionPt()); - //------------------------------------------------------------------------------ // BB overflow.entry: // get Lo and Hi of RHS & LHS: + IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); + auto *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs.trunc"); + auto *ShrHiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + auto *HiRHS = Builder.CreateTrunc(ShrHiRHS, LegalTy, "hi.rhs.trunc"); - auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs.trunc"); - auto *ShrHiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); - auto *HiRHS = BuilderEntryBB.CreateTrunc(ShrHiRHS, LegalTy, "hi.rhs.trunc"); - - auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs.trunc"); - auto *ShrHiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); - auto *HiLHS = BuilderEntryBB.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc"); + auto *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs.trunc"); + auto *ShrHiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + auto *HiLHS = Builder.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc"); - auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, ConstantInt::getNullValue(LegalTy)); - BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB); - OverflowoEntryBB->getTerminator()->eraseFromParent(); - - //------------------------------------------------------------------------------ - // BB overflow_yes_lhs: - Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, - ConstantInt::getNullValue(LegalTy)); - BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB); - - 
//------------------------------------------------------------------------------ - // BB overflow_no_lhs: - Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, ConstantInt::getNullValue(LegalTy)); - BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB); - - //------------------------------------------------------------------------------ - // BB overflow_no_rhs_only: - // RHS is 64 value range, LHS is 128 - // P0 = RHS * LoLHS - // P1 = RHS * HiLHS - - LoLHS = BuilderNoOverflowRHSonlyBB.CreateZExt(LoLHS, Ty, "lo.lhs"); - - // P0 = (RHS * LoLHS) - auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, LoLHS, - "mul.no.overflow.rhs.lolhs"); - auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs"); - auto *P0Hi = - BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr"); - P0Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs"); - - // P1 = (RHS * HiLHS) - auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(RHS, ShrHiLHS, - "mul.no.overflow.rhs.hilhs"); - auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs"); - auto *P1Hi = - BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr"); - P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs"); - - auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic( - Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); - auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue( - AddOverflow, 0, "rhs.p0.p1.res"); - auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue( - AddOverflow, 1, "rhs.p0.p1.carry"); - Carry = - BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); - auto *ResHi = - BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry"); - - auto *ResLoEx = - BuilderNoOverflowRHSonlyBB.CreateZExt(P0Lo, Ty, "rhs.res_lo.zext"); - auto *ResMid = - BuilderNoOverflowRHSonlyBB.CreateZExt(AddOResMid, Ty, "rhs.res_mid.zext"); - auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, - "rhs.res_mid.shl"); - auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl, - "rhs.res_lo.or.mid"); - auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp( - ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), - "rhs.check.overflow"); - - StructType *STy = StructType::get( - I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflowRHS = PoisonValue::get(STy); - StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( - StructValNoOverflowRHS, FinalRes, {0}); - StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( - StructValNoOverflowRHS, IsOverflow, {1}); - BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB); - //------------------------------------------------------------------------------ - - // BB overflow_no_lhs_only: - - LoRHS = BuilderNoOverflowLHSonlyBB.CreateZExt(LoRHS, Ty, "lo.rhs"); - - // P0 = (LHS * LoRHS) - P0 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, LoRHS, - "mul.no.overflow.lhs.lorhs"); - P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs"); - P0Hi = - BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs"); - P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs"); - - // P1 = (LHS * HiRHS) - P1 = BuilderNoOverflowLHSonlyBB.CreateMul(LHS, ShrHiRHS, - "mul.no.overflow.lhs.hirhs"); - P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, 
"p1.lo.lhs"); - P1Hi = - BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr"); - P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs"); - - AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic( - Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); - AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0, - "lhs.p0.p1.res"); - Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1, - "lhs.p0.p1.carry"); - Carry = - BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); - ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry"); - - ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(P0Lo, Ty, "lhs.res_lo.zext"); - ResMid = - BuilderNoOverflowLHSonlyBB.CreateZExt(AddOResMid, Ty, "lhs.res_mid.zext"); - ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, - "lhs.res_mid.shl"); - FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl, - "lhs.res_lo.or.mid"); - IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp( - ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), - "lhs.check.overflow"); - - STy = StructType::get(I->getContext(), - {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflowLHS = PoisonValue::get(STy); - StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( - StructValNoOverflowLHS, FinalRes, {0}); - StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( - StructValNoOverflowLHS, IsOverflow, {1}); + auto *Or = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + Builder.CreateCondBr(Or, OverflowBB, NoOverflowBB); + OverflowoEntryBB->getTerminator()->eraseFromParent(); - BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB); //------------------------------------------------------------------------------ - // BB overflow.no: - auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow"); - STy = StructType::get(I->getContext(), + Builder.SetInsertPoint(NoOverflowBB); + auto *Mul = Builder.CreateMul(LHS, RHS, "mul.no.overflow"); + StructType *STy = StructType::get(I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); StructValNoOverflow = - BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0}); - StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue( + Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); + StructValNoOverflow = Builder.CreateInsertValue( StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); - BuilderNoOverflowBB.CreateBr(OverflowResBB); + Builder.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ // BB overflow.res: - auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2); + Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); + auto *PHINode = Builder.CreatePHI(STy, 2); PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); - PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB); - PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB); // Before moving the mul.overflow intrinsic to the overflowBB, replace all its // uses by PHINode. 
@@ -6622,23 +6483,28 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { PHINode->addIncoming(I, OverflowBB); I->removeFromParent(); I->insertInto(OverflowBB, OverflowBB->end()); - IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB); + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + Builder.CreateBr(OverflowResBB); // return false to stop reprocessing the function. return false; } -// Rewrite the smul_with_overflow intrinsic by checking if any/both of the +// Rewrite the smul_with_overflow intrinsic by checking if both of the // operands' value range is within the legal type. If so, we can optimize the // multiplication algorithm. This code is supposed to be written during the step // of type legalization, but given that we need to reconstruct the IR which is // not doable there, we do it here. bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { + // Enable this optimization only for AArch64. + if (!TLI->getTargetMachine().getTargetTriple().isAArch64()) + return false; if (TLI->getTypeAction( I->getContext(), TLI->getValueType(*DL, I->getType()->getContainedType(0))) != TargetLowering::TypeExpandInteger) return false; + Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); auto *Ty = LHS->getType(); @@ -6646,307 +6512,66 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { unsigned VTBitWidth = Ty->getScalarSizeInBits(); unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - assert( - (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) == - TargetLowering::TypeLegal) && - "Expected the type to be legal for the target lowering"); + // Skip the optimization if the half-width type is not legal for the target. + if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) + return false; I->getParent()->setName("overflow.res"); auto *OverflowResBB = I->getParent(); auto *OverflowoEntryBB = I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); - BasicBlock *OverflowLHSBB = BasicBlock::Create( - I->getContext(), "overflow.lhs", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowLHSBB = BasicBlock::Create( - I->getContext(), "overflow.no.lhs", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowRHSonlyBB = BasicBlock::Create( - I->getContext(), "overflow.no.rhs.only", I->getFunction(), OverflowResBB); - BasicBlock *NoOverflowLHSonlyBB = BasicBlock::Create( - I->getContext(), "overflow.no.lhs.only", I->getFunction(), OverflowResBB); BasicBlock *NoOverflowBB = BasicBlock::Create( I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", I->getFunction(), OverflowResBB); // new blocks should be: // entry: - //       lhs_lo ne lhs_hi ? overflow_yes_lhs, overflow_no_lhs - - // overflow_yes_lhs: - //       rhs_lo ne rhs_hi ? overflow : overflow_no_rhs_only + //       (lhs_lo ne lhs_hi) || (rhs_lo ne rhs_hi) ? overflow, overflow_no - // overflow_no_lhs: - //       rhs_lo ne rhs_hi ?
overflow_no_lhs_only : overflow_no - - // overflow_no_rhs_only: - // overflow_no_lhs_only: // overflow_no: // overflow: // overflow.res: - IRBuilder<> BuilderEntryBB(OverflowoEntryBB->getTerminator()); - IRBuilder<> BuilderOverflowLHSBB(OverflowLHSBB); - IRBuilder<> BuilderNoOverflowLHSBB(NoOverflowLHSBB); - IRBuilder<> BuilderNoOverflowRHSonlyBB(NoOverflowRHSonlyBB); - IRBuilder<> BuilderNoOverflowLHSonlyBB(NoOverflowLHSonlyBB); - IRBuilder<> BuilderNoOverflowBB(NoOverflowBB); - IRBuilder<> BuilderOverflowResBB(OverflowResBB, - OverflowResBB->getFirstInsertionPt()); - //------------------------------------------------------------------------------ // BB overflow.entry: // get Lo and Hi of RHS & LHS: - - auto *LoRHS = BuilderEntryBB.CreateTrunc(RHS, LegalTy, "lo.rhs"); + IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); + auto *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); auto *SignLoRHS = - BuilderEntryBB.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); - auto *HiRHS = BuilderEntryBB.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); - HiRHS = BuilderEntryBB.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + auto *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); - auto *LoLHS = BuilderEntryBB.CreateTrunc(LHS, LegalTy, "lo.lhs"); + auto *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); auto *SignLoLHS = - BuilderEntryBB.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); - auto *HiLHS = BuilderEntryBB.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); - HiLHS = BuilderEntryBB.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); - - auto *Cmp = BuilderEntryBB.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); - BuilderEntryBB.CreateCondBr(Cmp, OverflowLHSBB, NoOverflowLHSBB); + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + auto *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + + auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); + auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); + auto *Or = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + Builder.CreateCondBr(Or, OverflowBB, NoOverflowBB); OverflowoEntryBB->getTerminator()->eraseFromParent(); //------------------------------------------------------------------------------ - // BB overflow_yes_lhs: - Cmp = BuilderOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); - BuilderOverflowLHSBB.CreateCondBr(Cmp, OverflowBB, NoOverflowRHSonlyBB); - - //------------------------------------------------------------------------------ - // BB overflow_no_lhs: - Cmp = BuilderNoOverflowLHSBB.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); - BuilderNoOverflowLHSBB.CreateCondBr(Cmp, NoOverflowLHSonlyBB, NoOverflowBB); - - //------------------------------------------------------------------------------ - // BB overflow_no_rhs_only: - // RHS is within 64 value range, LHS is 128 - // P0 = RHS * LoLHS - // P1 = RHS * HiLHS - - // check sign of RHS: - auto *IsNegRHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(RHS, "rhs.isneg"); - auto *AbsRHSIntr = BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic( - Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {}, - "abs.rhs"); - auto *AbsRHS = BuilderNoOverflowRHSonlyBB.CreateSelect( - IsNegRHS, AbsRHSIntr, RHS, "lo.abs.rhs.select"); - - // check sign of LHS: - auto *IsNegLHS = BuilderNoOverflowRHSonlyBB.CreateIsNeg(LHS, "lhs.isneg"); - auto *AbsLHSIntr = 
BuilderNoOverflowRHSonlyBB.CreateBinaryIntrinsic( - Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {}, - "abs.lhs"); - auto *AbsLHS = BuilderNoOverflowRHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr, - LHS, "abs.lhs.select"); - LoLHS = BuilderNoOverflowRHSonlyBB.CreateAnd( - AbsLHS, - ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)), - "lo.abs.lhs"); - HiLHS = BuilderNoOverflowRHSonlyBB.CreateLShr(AbsLHS, VTHalfBitWidth, - "hi.abs.lhs"); - - // P0 = (RHS * LoLHS) - auto *P0 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, LoLHS, - "mul.no.overflow.rhs.lolhs"); - auto *P0Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.rhs"); - auto *P0Hi = - BuilderNoOverflowRHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.rhs.lsr"); - P0Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.rhs"); - - // P1 = (RHS * HiLHS) - auto *P1 = BuilderNoOverflowRHSonlyBB.CreateMul(AbsRHS, HiLHS, - "mul.no.overflow.rhs.hilhs"); - auto *P1Lo = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.rhs"); - auto *P1Hi = - BuilderNoOverflowRHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.rhs.lsr"); - P1Hi = BuilderNoOverflowRHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.rhs"); - - auto *AddOverflow = BuilderNoOverflowRHSonlyBB.CreateIntrinsic( - Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); - auto *AddOResMid = BuilderNoOverflowRHSonlyBB.CreateExtractValue( - AddOverflow, 0, "rhs.p0.p1.res"); - auto *Carry = BuilderNoOverflowRHSonlyBB.CreateExtractValue( - AddOverflow, 1, "rhs.p0.p1.carry"); - Carry = - BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); - auto *ResHi = - BuilderNoOverflowRHSonlyBB.CreateAdd(P1Hi, Carry, "rhs.p1.carry"); - - // sign handling: - auto *IsNeg = BuilderNoOverflowRHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1 - auto *Mask = - BuilderNoOverflowRHSonlyBB.CreateSExt(IsNeg, LegalTy, "rhs.sign.mask"); - auto *Add_1 = - BuilderNoOverflowRHSonlyBB.CreateZExt(IsNeg, LegalTy, "rhs.add.1"); - auto *ResLo = - BuilderNoOverflowRHSonlyBB.CreateXor(P0Lo, Mask, "rhs.res_lo.xor.mask"); - ResLo = - BuilderNoOverflowRHSonlyBB.CreateAdd(ResLo, Add_1, "rhs.res_lo.add.1"); - - Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1, - "rhs.check.res_lo.carry"); - Carry = - BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); - auto *ResMid = BuilderNoOverflowRHSonlyBB.CreateXor(AddOResMid, Mask, - "rhs.res_mid.xor.mask"); - ResMid = - BuilderNoOverflowRHSonlyBB.CreateAdd(ResMid, Carry, "rhs.res_mid.carry"); - - Carry = BuilderNoOverflowRHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid, - Carry, "rhs.check.reslo.carry"); - Carry = - BuilderNoOverflowRHSonlyBB.CreateZExt(Carry, LegalTy, "rhs.carry.zext"); - ResHi = - BuilderNoOverflowRHSonlyBB.CreateXor(ResHi, Mask, "rhs.res_hi.xor.mask"); - ResHi = - BuilderNoOverflowRHSonlyBB.CreateAdd(ResHi, Carry, "rhs.res_hi.carry"); - // set the final result: - auto *ResLoEx = - BuilderNoOverflowRHSonlyBB.CreateZExt(ResLo, Ty, "rhs.res_lo.zext"); - ResMid = - BuilderNoOverflowRHSonlyBB.CreateZExt(ResMid, Ty, "rhs.res_mid.zext"); - auto *ResMidShl = BuilderNoOverflowRHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, - "rhs.res_mid.shl"); - auto *FinalRes = BuilderNoOverflowRHSonlyBB.CreateOr(ResLoEx, ResMidShl, - "rhs.res_lo.or.mid"); - auto *IsOverflow = BuilderNoOverflowRHSonlyBB.CreateICmp( - ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), - "rhs.check.overflow"); - - StructType *STy = StructType::get( - I->getContext(), {Ty, 
IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflowRHS = PoisonValue::get(STy); - StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( - StructValNoOverflowRHS, FinalRes, {0}); - StructValNoOverflowRHS = BuilderNoOverflowRHSonlyBB.CreateInsertValue( - StructValNoOverflowRHS, IsOverflow, {1}); - BuilderNoOverflowRHSonlyBB.CreateBr(OverflowResBB); - //------------------------------------------------------------------------------ - - // BB overflow_no_lhs_only: - // LHS (64), RHS is 128 - // P0 = LHS * LoRHS - // P1 = LHS * HiRHS - - // check sign of LHS: - IsNegLHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(LHS, "lhs.isneg"); - AbsLHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic( - Intrinsic::abs, LHS, ConstantInt::getFalse(I->getContext()), {}, - "abs.lhs"); - AbsLHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegLHS, AbsLHSIntr, LHS, - "abs.lhs.select"); - - // check sign of RHS: - IsNegRHS = BuilderNoOverflowLHSonlyBB.CreateIsNeg(RHS, "rhs.isneg"); - AbsRHSIntr = BuilderNoOverflowLHSonlyBB.CreateBinaryIntrinsic( - Intrinsic::abs, RHS, ConstantInt::getFalse(I->getContext()), {}, - "abs.rhs"); - AbsRHS = BuilderNoOverflowLHSonlyBB.CreateSelect(IsNegRHS, AbsRHSIntr, RHS, - "abs.rhs.select"); - - LoRHS = BuilderNoOverflowLHSonlyBB.CreateAnd( - AbsRHS, - ConstantInt::get(Ty, APInt::getLowBitsSet(VTBitWidth, VTHalfBitWidth)), - "lo.abs.rhs"); - HiRHS = BuilderNoOverflowLHSonlyBB.CreateLShr(AbsRHS, VTHalfBitWidth, - "hi.abs.rhs"); - - // P0 = (LHS * LoRHS) - P0 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, LoRHS, - "mul.no.overflow.lhs.lorhs"); - P0Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0, LegalTy, "p0.lo.lhs"); - P0Hi = - BuilderNoOverflowLHSonlyBB.CreateLShr(P0, VTHalfBitWidth, "p0.lsr.lhs"); - P0Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P0Hi, LegalTy, "p0.hi.lhs"); - - // P1 = (LHS * HiRHS) - P1 = BuilderNoOverflowLHSonlyBB.CreateMul(AbsLHS, HiRHS, - "mul.no.overflow.lhs.hirhs"); - P1Lo = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1, LegalTy, "p1.lo.lhs"); - P1Hi = - BuilderNoOverflowLHSonlyBB.CreateLShr(P1, VTHalfBitWidth, "p1.lhs.lsr"); - P1Hi = BuilderNoOverflowLHSonlyBB.CreateTrunc(P1Hi, LegalTy, "p1.hi.lhs"); - - AddOverflow = BuilderNoOverflowLHSonlyBB.CreateIntrinsic( - Intrinsic::uadd_with_overflow, LegalTy, {P0Hi, P1Lo}); - AddOResMid = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 0, - "lhs.p0.p1.res"); - Carry = BuilderNoOverflowLHSonlyBB.CreateExtractValue(AddOverflow, 1, - "lhs.p0.p1.carry"); - Carry = - BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); - ResHi = BuilderNoOverflowLHSonlyBB.CreateAdd(P1Hi, Carry, "lhs.p1.carry"); - - // sign handling: - IsNeg = BuilderNoOverflowLHSonlyBB.CreateXor(IsNegRHS, IsNegLHS); // i1 - Mask = BuilderNoOverflowLHSonlyBB.CreateSExt(IsNeg, LegalTy, "lhs.sign.mask"); - Add_1 = BuilderNoOverflowLHSonlyBB.CreateZExt(IsNeg, LegalTy, "lhs.add.1"); - ResLo = - BuilderNoOverflowLHSonlyBB.CreateXor(P0Lo, Mask, "lhs.res_lo.xor.mask"); - ResLo = - BuilderNoOverflowLHSonlyBB.CreateAdd(ResLo, Add_1, "lhs.res_lo.add.1"); - - Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResLo, Add_1, - "lhs.check.res_lo.carry"); - Carry = - BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); - ResMid = BuilderNoOverflowLHSonlyBB.CreateXor(AddOResMid, Mask, - "lhs.res_mid.xor.mask"); - ResMid = - BuilderNoOverflowLHSonlyBB.CreateAdd(ResMid, Carry, "lhs.res_mid.carry"); - - Carry = BuilderNoOverflowLHSonlyBB.CreateCmp(ICmpInst::ICMP_ULT, ResMid, - 
Carry, "lhs.check.reslo.carry"); - Carry = - BuilderNoOverflowLHSonlyBB.CreateZExt(Carry, LegalTy, "lhs.carry.zext"); - ResHi = - BuilderNoOverflowLHSonlyBB.CreateXor(ResHi, Mask, "lhs.res_hi.xor.mask"); - ResHi = - BuilderNoOverflowLHSonlyBB.CreateAdd(ResHi, Carry, "lhs.res_hi.carry"); - // Set the final result: - ResLoEx = BuilderNoOverflowLHSonlyBB.CreateZExt(ResLo, Ty, "lhs.res_lo.zext"); - ResMid = - BuilderNoOverflowLHSonlyBB.CreateZExt(ResMid, Ty, "lhs.res_mid.zext"); - ResMidShl = BuilderNoOverflowLHSonlyBB.CreateShl(ResMid, VTHalfBitWidth, - "lhs.res_mid.shl"); - FinalRes = BuilderNoOverflowLHSonlyBB.CreateOr(ResLoEx, ResMidShl, - "lhs.res_lo.or.mid"); - IsOverflow = BuilderNoOverflowLHSonlyBB.CreateICmp( - ICmpInst::ICMP_NE, ResHi, Constant::getNullValue(LegalTy), - "lhs.check.overflow"); - - STy = StructType::get(I->getContext(), - {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflowLHS = PoisonValue::get(STy); - StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( - StructValNoOverflowLHS, FinalRes, {0}); - StructValNoOverflowLHS = BuilderNoOverflowLHSonlyBB.CreateInsertValue( - StructValNoOverflowLHS, IsOverflow, {1}); - - BuilderNoOverflowLHSonlyBB.CreateBr(OverflowResBB); - //------------------------------------------------------------------------------ - // BB overflow.no: - auto *Mul = BuilderNoOverflowBB.CreateMul(LHS, RHS, "mul.no.overflow"); - STy = StructType::get(I->getContext(), + Builder.SetInsertPoint(NoOverflowBB); + auto *Mul = Builder.CreateMul(LHS, RHS, "mul.no.overflow"); + StructType * STy = StructType::get(I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); StructValNoOverflow = - BuilderNoOverflowBB.CreateInsertValue(StructValNoOverflow, Mul, {0}); - StructValNoOverflow = BuilderNoOverflowBB.CreateInsertValue( + Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); + StructValNoOverflow = Builder.CreateInsertValue( StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); - BuilderNoOverflowBB.CreateBr(OverflowResBB); + Builder.CreateBr(OverflowResBB); + //------------------------------------------------------------------------------ // BB overflow.res: - auto *PHINode = BuilderOverflowResBB.CreatePHI(STy, 2); + Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); + auto *PHINode = Builder.CreatePHI(STy, 2); PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); - PHINode->addIncoming(StructValNoOverflowLHS, NoOverflowLHSonlyBB); - PHINode->addIncoming(StructValNoOverflowRHS, NoOverflowRHSonlyBB); // Before moving the mul.overflow intrinsic to the overflowBB, replace all its // uses by PHINode. @@ -6956,7 +6581,8 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { PHINode->addIncoming(I, OverflowBB); I->removeFromParent(); I->insertInto(OverflowBB, OverflowBB->end()); - IRBuilder<>(OverflowBB, OverflowBB->end()).CreateBr(OverflowResBB); + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + Builder.CreateBr(OverflowResBB); // return false to stop reprocessing the function. 
return false; diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index e2791f44d0a08..5c7aa3e62ec1b 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -262,10 +262,9 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cbz x1, .LBB17_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs -; CHECK-NEXT: cbz x3, .LBB17_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne @@ -279,35 +278,14 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: b .LBB17_8 -; CHECK-NEXT: .LBB17_3: // %overflow.no.lhs +; CHECK-NEXT: b .LBB17_3 +; CHECK-NEXT: .LBB17_2: // %overflow.no ; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: cbz x3, .LBB17_7 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: madd x8, x1, x2, x8 -; CHECK-NEXT: umulh x9, x0, x3 -; CHECK-NEXT: mul x10, x0, x3 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: b .LBB17_6 -; CHECK-NEXT: .LBB17_5: // %overflow.no.rhs.only -; CHECK-NEXT: umulh x8, x2, x0 -; CHECK-NEXT: umulh x9, x2, x1 -; CHECK-NEXT: madd x8, x3, x0, x8 -; CHECK-NEXT: mul x10, x2, x1 -; CHECK-NEXT: mul x11, x3, x1 -; CHECK-NEXT: mul x0, x2, x0 -; CHECK-NEXT: .LBB17_6: // %overflow.res -; CHECK-NEXT: adds x1, x8, x10 -; CHECK-NEXT: adcs xzr, x9, x11 -; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: b .LBB17_8 -; CHECK-NEXT: .LBB17_7: // %overflow.no ; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: madd x1, x1, x2, x8 ; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: .LBB17_8: // %overflow.res +; CHECK-NEXT: .LBB17_3: // %overflow.res ; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: bic w2, w9, w8 ; CHECK-NEXT: ret @@ -324,10 +302,9 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cbz x1, .LBB18_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs -; CHECK-NEXT: cbz x3, .LBB18_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB18_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne @@ -343,30 +320,8 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: and w2, w8, #0x1 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB18_3: // %overflow.no.lhs +; CHECK-NEXT: .LBB18_2: // %overflow.no ; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: cbz x3, .LBB18_7 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: madd x8, x1, x2, x8 -; CHECK-NEXT: umulh x9, x0, x3 -; CHECK-NEXT: mul x10, x0, x3 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: b .LBB18_6 -; CHECK-NEXT: .LBB18_5: // %overflow.no.rhs.only -; CHECK-NEXT: umulh x8, x2, x0 -; CHECK-NEXT: umulh x9, x2, x1 -; CHECK-NEXT: madd x8, x3, x0, x8 -; CHECK-NEXT: mul x10, x2, x1 -; CHECK-NEXT: mul x11, x3, x1 -; CHECK-NEXT: mul x0, x2, x0 -; CHECK-NEXT: .LBB18_6: // %overflow.res -; CHECK-NEXT: adds x1, x8, x10 -; CHECK-NEXT: adcs xzr, x9, x11 -; 
CHECK-NEXT: cset w8, ne -; CHECK-NEXT: and w2, w8, #0x1 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB18_7: // %overflow.no ; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: madd x1, x1, x2, x8 @@ -384,10 +339,9 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cbz x1, .LBB19_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs -; CHECK-NEXT: cbz x3, .LBB19_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB19_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x8, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne @@ -401,35 +355,14 @@ define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cset w10, ne ; CHECK-NEXT: adds x9, x12, x11 ; CHECK-NEXT: csinc w10, w10, wzr, lo -; CHECK-NEXT: b .LBB19_8 -; CHECK-NEXT: .LBB19_3: // %overflow.no.lhs +; CHECK-NEXT: b .LBB19_3 +; CHECK-NEXT: .LBB19_2: // %overflow.no ; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: cbz x3, .LBB19_7 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: madd x9, x1, x2, x8 -; CHECK-NEXT: umulh x10, x0, x3 -; CHECK-NEXT: mul x11, x0, x3 -; CHECK-NEXT: mul x12, x1, x3 -; CHECK-NEXT: mul x8, x0, x2 -; CHECK-NEXT: b .LBB19_6 -; CHECK-NEXT: .LBB19_5: // %overflow.no.rhs.only -; CHECK-NEXT: umulh x8, x2, x0 -; CHECK-NEXT: umulh x10, x2, x1 -; CHECK-NEXT: madd x9, x3, x0, x8 -; CHECK-NEXT: mul x11, x2, x1 -; CHECK-NEXT: mul x12, x3, x1 -; CHECK-NEXT: mul x8, x2, x0 -; CHECK-NEXT: .LBB19_6: // %overflow.res -; CHECK-NEXT: adds x9, x9, x11 -; CHECK-NEXT: adcs xzr, x10, x12 -; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: b .LBB19_8 -; CHECK-NEXT: .LBB19_7: // %overflow.no -; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: madd x9, x1, x2, x8 ; CHECK-NEXT: mul x8, x0, x2 -; CHECK-NEXT: .LBB19_8: // %overflow.res +; CHECK-NEXT: .LBB19_3: // %overflow.res ; CHECK-NEXT: tst w10, #0x1 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq @@ -456,13 +389,20 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.eq .LBB21_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: b.ne .LBB21_3 +; CHECK-NEXT: // %bb.1: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB21_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: b.ne .LBB21_3 +; CHECK-NEXT: // %bb.2: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: b .LBB21_4 +; CHECK-NEXT: .LBB21_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -473,103 +413,24 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 ; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 ; CHECK-NEXT: asr x11, x8, #63 -; CHECK-NEXT: adds x9, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x9 -; CHECK-NEXT: adc x9, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: adds x1, x14, x10 +; 
CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adds x8, x8, x9 -; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: adc x11, x11, x12 ; CHECK-NEXT: adds x8, x15, x8 -; CHECK-NEXT: adc x10, x10, x11 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: ccmp x10, x9, #0, eq -; CHECK-NEXT: b .LBB21_7 -; CHECK-NEXT: .LBB21_3: // %overflow.no.lhs -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB21_8 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: asr x8, x1, #63 -; CHECK-NEXT: asr x10, x3, #63 -; CHECK-NEXT: eor x9, x0, x8 -; CHECK-NEXT: eor x11, x1, x8 -; CHECK-NEXT: eor x12, x2, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: eor x11, x3, x10 -; CHECK-NEXT: csel x8, x8, x1, lt -; CHECK-NEXT: csel x9, x9, x0, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: csel x11, x12, x2, lt -; CHECK-NEXT: csel x10, x10, x3, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: b .LBB21_6 -; CHECK-NEXT: .LBB21_5: // %overflow.no.rhs.only -; CHECK-NEXT: asr x8, x3, #63 -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: eor x9, x2, x8 -; CHECK-NEXT: eor x11, x3, x8 -; CHECK-NEXT: eor x12, x0, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: eor x11, x1, x10 -; CHECK-NEXT: csel x8, x8, x3, lt -; CHECK-NEXT: csel x9, x9, x2, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: csel x11, x12, x0, lt -; CHECK-NEXT: csel x10, x10, x1, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w13, w12 -; CHECK-NEXT: .LBB21_6: // %overflow.res -; CHECK-NEXT: sbfx x12, x10, #0, #1 -; CHECK-NEXT: adds x8, x8, x11 -; CHECK-NEXT: adc x9, x9, x15 -; CHECK-NEXT: eor x13, x14, x12 -; CHECK-NEXT: eor x8, x8, x12 -; CHECK-NEXT: add x0, x13, x10 -; CHECK-NEXT: cmp x0, x10 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cinc x1, x8, lo -; CHECK-NEXT: eor x8, x9, x12 -; CHECK-NEXT: cmp x1, x10 -; CHECK-NEXT: cinc x8, x8, lo -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: .LBB21_7: // %overflow.res +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: b .LBB21_9 -; CHECK-NEXT: .LBB21_8: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: .LBB21_9: // %overflow.res +; CHECK-NEXT: .LBB21_4: // %overflow.res ; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: bic w2, w9, w8 ; CHECK-NEXT: ret @@ -586,13 +447,20 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: 
b.eq .LBB22_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: b.ne .LBB22_3 +; CHECK-NEXT: // %bb.1: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB22_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: b.ne .LBB22_3 +; CHECK-NEXT: // %bb.2: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: and w2, wzr, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -603,103 +471,24 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 ; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 ; CHECK-NEXT: asr x11, x8, #63 -; CHECK-NEXT: adds x9, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x9 -; CHECK-NEXT: adc x9, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adds x8, x8, x9 -; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: adc x11, x11, x12 ; CHECK-NEXT: adds x8, x15, x8 -; CHECK-NEXT: adc x10, x10, x11 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: ccmp x10, x9, #0, eq -; CHECK-NEXT: b .LBB22_7 -; CHECK-NEXT: .LBB22_3: // %overflow.no.lhs -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB22_8 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: asr x8, x1, #63 -; CHECK-NEXT: asr x10, x3, #63 -; CHECK-NEXT: eor x9, x0, x8 -; CHECK-NEXT: eor x11, x1, x8 -; CHECK-NEXT: eor x12, x2, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: eor x11, x3, x10 -; CHECK-NEXT: csel x8, x8, x1, lt -; CHECK-NEXT: csel x9, x9, x0, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: csel x11, x12, x2, lt -; CHECK-NEXT: csel x10, x10, x3, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: b .LBB22_6 -; CHECK-NEXT: .LBB22_5: // %overflow.no.rhs.only -; CHECK-NEXT: asr x8, x3, #63 -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: eor x9, x2, x8 -; CHECK-NEXT: eor x11, x3, x8 -; CHECK-NEXT: eor x12, x0, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: eor x11, x1, x10 -; CHECK-NEXT: csel x8, x8, x3, lt -; CHECK-NEXT: csel x9, x9, x2, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: csel x11, x12, x0, lt -; CHECK-NEXT: csel x10, x10, x1, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w13, w12 -; CHECK-NEXT: .LBB22_6: // %overflow.res -; CHECK-NEXT: sbfx x12, x10, #0, #1 -; CHECK-NEXT: adds x8, x8, x11 -; CHECK-NEXT: adc x9, x9, x15 -; CHECK-NEXT: eor x13, 
x14, x12 -; CHECK-NEXT: eor x8, x8, x12 -; CHECK-NEXT: add x0, x13, x10 -; CHECK-NEXT: cmp x0, x10 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cinc x1, x8, lo -; CHECK-NEXT: eor x8, x9, x12 -; CHECK-NEXT: cmp x1, x10 -; CHECK-NEXT: cinc x8, x8, lo -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: .LBB22_7: // %overflow.res +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: and w2, w8, #0x1 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB22_8: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: and w2, wzr, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -713,13 +502,20 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.eq .LBB23_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: b.ne .LBB23_3 +; CHECK-NEXT: // %bb.1: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB23_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: b.ne .LBB23_3 +; CHECK-NEXT: // %bb.2: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: madd x8, x1, x2, x8 +; CHECK-NEXT: b .LBB23_4 +; CHECK-NEXT: .LBB23_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -729,109 +525,30 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: adc x9, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x9, x14, x10 +; CHECK-NEXT: adds x8, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: asr x14, x9, #63 +; CHECK-NEXT: asr x14, x8, #63 ; CHECK-NEXT: smulh x10, x1, x3 ; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x8, #63 +; CHECK-NEXT: asr x12, x9, #63 ; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x11, x8, x11 -; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: adds x11, x9, x11 +; CHECK-NEXT: mul x9, x0, x2 ; CHECK-NEXT: adc x12, x12, x13 ; CHECK-NEXT: adds x11, x15, x11 ; CHECK-NEXT: adc x10, x10, x12 ; CHECK-NEXT: cmp x11, x14 ; CHECK-NEXT: ccmp x10, x14, #0, eq -; CHECK-NEXT: b .LBB23_7 -; CHECK-NEXT: .LBB23_3: // %overflow.no.lhs -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB23_8 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: asr x8, x1, #63 -; CHECK-NEXT: asr x10, x3, #63 -; CHECK-NEXT: eor x9, x0, x8 -; CHECK-NEXT: eor x11, x1, x8 -; CHECK-NEXT: eor x12, x2, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: eor x11, x3, x10 -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: csel x8, x8, x1, lt -; CHECK-NEXT: csel x9, x9, x0, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: csel x11, x12, x2, lt -; CHECK-NEXT: csel x10, x10, x3, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh 
x9, x9, x10 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: b .LBB23_6 -; CHECK-NEXT: .LBB23_5: // %overflow.no.rhs.only -; CHECK-NEXT: asr x8, x3, #63 -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: eor x9, x2, x8 -; CHECK-NEXT: eor x11, x3, x8 -; CHECK-NEXT: eor x12, x0, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: eor x11, x1, x10 -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: csel x8, x8, x3, lt -; CHECK-NEXT: csel x9, x9, x2, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: csel x11, x12, x0, lt -; CHECK-NEXT: csel x10, x10, x1, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w13, w12 -; CHECK-NEXT: .LBB23_6: // %overflow.res -; CHECK-NEXT: sbfx x12, x10, #0, #1 -; CHECK-NEXT: adds x11, x8, x11 -; CHECK-NEXT: eor x13, x14, x12 -; CHECK-NEXT: add x8, x13, x10 -; CHECK-NEXT: adc x13, x9, x15 -; CHECK-NEXT: eor x9, x11, x12 -; CHECK-NEXT: cmp x8, x10 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cinc x9, x9, lo -; CHECK-NEXT: cmp x9, x10 -; CHECK-NEXT: eor x10, x13, x12 -; CHECK-NEXT: cinc x10, x10, lo -; CHECK-NEXT: cmp x10, #0 -; CHECK-NEXT: .LBB23_7: // %overflow.res ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: b .LBB23_9 -; CHECK-NEXT: .LBB23_8: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: mov w10, wzr -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: madd x9, x1, x2, x8 -; CHECK-NEXT: mul x8, x0, x2 -; CHECK-NEXT: .LBB23_9: // %overflow.res +; CHECK-NEXT: .LBB23_4: // %overflow.res ; CHECK-NEXT: eor x11, x3, x1 ; CHECK-NEXT: tst w10, #0x1 ; CHECK-NEXT: asr x11, x11, #63 ; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff -; CHECK-NEXT: csinv x0, x8, x11, eq -; CHECK-NEXT: csel x1, x12, x9, ne +; CHECK-NEXT: csinv x0, x9, x11, eq +; CHECK-NEXT: csel x1, x12, x8, ne ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index ef004085373cd..34f8b10c24902 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -224,10 +224,9 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cbz x1, .LBB4_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs -; CHECK-NEXT: cbz x3, .LBB4_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne @@ -241,31 +240,16 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: tbnz w8, #0, .LBB4_7 -; CHECK-NEXT: b .LBB4_8 -; CHECK-NEXT: .LBB4_3: // %overflow.no.lhs +; CHECK-NEXT: tbnz w8, #0, .LBB4_3 +; CHECK-NEXT: b .LBB4_4 +; CHECK-NEXT: .LBB4_2: // %overflow.no ; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: cbz x3, .LBB4_9 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: madd x8, x1, x2, x8 -; CHECK-NEXT: umulh x9, x0, x3 -; CHECK-NEXT: mul x10, x0, x3 -; CHECK-NEXT: mul x11, x1, x3 +; 
CHECK-NEXT: madd x8, x0, x3, x8 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: b .LBB4_6 -; CHECK-NEXT: .LBB4_5: // %overflow.no.rhs.only -; CHECK-NEXT: umulh x8, x2, x0 -; CHECK-NEXT: umulh x9, x2, x1 -; CHECK-NEXT: madd x8, x3, x0, x8 -; CHECK-NEXT: mul x10, x2, x1 -; CHECK-NEXT: mul x11, x3, x1 -; CHECK-NEXT: mul x0, x2, x0 -; CHECK-NEXT: .LBB4_6: // %overflow.res -; CHECK-NEXT: adds x1, x8, x10 -; CHECK-NEXT: adcs xzr, x9, x11 -; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: tbz w8, #0, .LBB4_8 -; CHECK-NEXT: .LBB4_7: // %if.then +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEXT: .LBB4_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -274,15 +258,8 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .LBB4_8: // %cleanup +; CHECK-NEXT: .LBB4_4: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_9: // %overflow.no -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: tbnz w8, #0, .LBB4_7 -; CHECK-NEXT: b .LBB4_8 entry: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 @@ -305,13 +282,21 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.eq .LBB5_3 -; CHECK-NEXT: // %bb.1: // %overflow.lhs +; CHECK-NEXT: b.ne .LBB5_3 +; CHECK-NEXT: // %bb.1: // %overflow.entry +; CHECK-NEXT: asr x8, x2, #63 ; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB5_5 -; CHECK-NEXT: // %bb.2: // %overflow +; CHECK-NEXT: b.ne .LBB5_3 +; CHECK-NEXT: // %bb.2: // %overflow.no +; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: tbnz w8, #0, .LBB5_4 +; CHECK-NEXT: b .LBB5_5 +; CHECK-NEXT: .LBB5_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -322,97 +307,25 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 ; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 ; CHECK-NEXT: asr x11, x8, #63 -; CHECK-NEXT: adds x9, x14, x10 ; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x9 -; CHECK-NEXT: adc x9, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adds x8, x8, x9 -; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: adc x11, x11, x12 ; CHECK-NEXT: adds x8, x15, x8 -; CHECK-NEXT: adc x10, x10, x11 -; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: ccmp x10, x9, #0, eq -; CHECK-NEXT: b .LBB5_7 -; CHECK-NEXT: .LBB5_3: // %overflow.no.lhs -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.eq .LBB5_10 -; CHECK-NEXT: // %bb.4: // %overflow.no.lhs.only -; CHECK-NEXT: asr x8, x1, #63 -; CHECK-NEXT: asr x10, x3, #63 -; CHECK-NEXT: eor x9, x0, x8 -; CHECK-NEXT: eor x11, x1, x8 -; CHECK-NEXT: eor 
x12, x2, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: eor x11, x3, x10 -; CHECK-NEXT: csel x8, x8, x1, lt -; CHECK-NEXT: csel x9, x9, x0, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: csel x11, x12, x2, lt -; CHECK-NEXT: csel x10, x10, x3, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w12, w13 -; CHECK-NEXT: b .LBB5_6 -; CHECK-NEXT: .LBB5_5: // %overflow.no.rhs.only -; CHECK-NEXT: asr x8, x3, #63 -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: eor x9, x2, x8 -; CHECK-NEXT: eor x11, x3, x8 -; CHECK-NEXT: eor x12, x0, x10 -; CHECK-NEXT: subs x9, x9, x8 -; CHECK-NEXT: sbc x8, x11, x8 -; CHECK-NEXT: cmp x3, #0 -; CHECK-NEXT: eor x11, x1, x10 -; CHECK-NEXT: csel x8, x8, x3, lt -; CHECK-NEXT: csel x9, x9, x2, lt -; CHECK-NEXT: cset w13, lt -; CHECK-NEXT: subs x12, x12, x10 -; CHECK-NEXT: sbc x10, x11, x10 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: csel x11, x12, x0, lt -; CHECK-NEXT: csel x10, x10, x1, lt -; CHECK-NEXT: umulh x12, x9, x11 -; CHECK-NEXT: mul x14, x9, x11 -; CHECK-NEXT: mul x15, x8, x10 -; CHECK-NEXT: madd x8, x8, x11, x12 -; CHECK-NEXT: cset w12, lt -; CHECK-NEXT: mul x11, x9, x10 -; CHECK-NEXT: umulh x9, x9, x10 -; CHECK-NEXT: eor w10, w13, w12 -; CHECK-NEXT: .LBB5_6: // %overflow.res -; CHECK-NEXT: sbfx x12, x10, #0, #1 -; CHECK-NEXT: adds x8, x8, x11 -; CHECK-NEXT: adc x9, x9, x15 -; CHECK-NEXT: eor x13, x14, x12 -; CHECK-NEXT: eor x8, x8, x12 -; CHECK-NEXT: add x0, x13, x10 -; CHECK-NEXT: cmp x0, x10 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: cinc x1, x8, lo -; CHECK-NEXT: eor x8, x9, x12 -; CHECK-NEXT: cmp x1, x10 -; CHECK-NEXT: cinc x8, x8, lo -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: .LBB5_7: // %overflow.res +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: tbz w8, #0, .LBB5_9 -; CHECK-NEXT: .LBB5_8: // %if.then +; CHECK-NEXT: tbz w8, #0, .LBB5_5 +; CHECK-NEXT: .LBB5_4: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -421,16 +334,8 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .LBB5_9: // %cleanup +; CHECK-NEXT: .LBB5_5: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_10: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: tbnz w8, #0, .LBB5_8 -; CHECK-NEXT: b .LBB5_9 entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index a240055b3f655..14b1dc7f2d6df 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -4,10 +4,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: ; AARCH: // %bb.0: // %overflow.entry -; AARCH-NEXT: cbz x1, .LBB0_3 -; AARCH-NEXT: // %bb.1: // %overflow.lhs -; AARCH-NEXT: cbz x3, .LBB0_5 -; AARCH-NEXT: // %bb.2: // %overflow +; AARCH-NEXT: orr x8, x1, x3 +; AARCH-NEXT: cbz x8, .LBB0_2 +; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: mul x9, x3, x0 ; AARCH-NEXT: cmp x1, #0 ; AARCH-NEXT: ccmp x3, #0, #4, ne @@ -23,30 +22,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-NEXT: csinc w8, w8, wzr, lo ; AARCH-NEXT: and w2, w8, #0x1 ; AARCH-NEXT: ret -; AARCH-NEXT: .LBB0_3: // %overflow.no.lhs +; AARCH-NEXT: .LBB0_2: // %overflow.no ; AARCH-NEXT: umulh x8, x0, x2 -; AARCH-NEXT: cbz x3, .LBB0_7 -; AARCH-NEXT: // %bb.4: // %overflow.no.lhs.only -; AARCH-NEXT: madd x8, x1, x2, x8 -; AARCH-NEXT: umulh x9, x0, x3 -; AARCH-NEXT: mul x10, x0, x3 -; AARCH-NEXT: mul x11, x1, x3 -; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: b .LBB0_6 -; AARCH-NEXT: .LBB0_5: // %overflow.no.rhs.only -; AARCH-NEXT: umulh x8, x2, x0 -; AARCH-NEXT: umulh x9, x2, x1 -; AARCH-NEXT: madd x8, x3, x0, x8 -; AARCH-NEXT: mul x10, x2, x1 -; AARCH-NEXT: mul x11, x3, x1 -; AARCH-NEXT: mul x0, x2, x0 -; AARCH-NEXT: .LBB0_6: // %overflow.res -; AARCH-NEXT: adds x1, x8, x10 -; AARCH-NEXT: adcs xzr, x9, x11 -; AARCH-NEXT: cset w8, ne -; AARCH-NEXT: and w2, w8, #0x1 -; AARCH-NEXT: ret -; AARCH-NEXT: .LBB0_7: // %overflow.no ; AARCH-NEXT: madd x8, x0, x3, x8 ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: madd x1, x1, x2, x8 @@ -69,14 +46,22 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: ; AARCH: // %bb.0: // %overflow.entry -; AARCH-NEXT: asr x8, x2, #63 ; AARCH-NEXT: cmp x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: b.eq .LBB1_3 -; AARCH-NEXT: // %bb.1: // %overflow.lhs +; AARCH-NEXT: b.ne .LBB1_3 +; AARCH-NEXT: // %bb.1: // %overflow.entry +; AARCH-NEXT: asr x8, x2, #63 ; AARCH-NEXT: cmp x3, x8 -; AARCH-NEXT: b.eq .LBB1_5 -; AARCH-NEXT: // %bb.2: // %overflow +; AARCH-NEXT: b.ne .LBB1_3 +; AARCH-NEXT: // %bb.2: // %overflow.no +; AARCH-NEXT: umulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: madd x8, x0, x3, x8 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: madd x8, x1, x2, x8 +; AARCH-NEXT: tbnz x1, #63, .LBB1_4 +; AARCH-NEXT: b .LBB1_5 +; AARCH-NEXT: .LBB1_3: // %overflow ; AARCH-NEXT: asr 
x9, x1, #63 ; AARCH-NEXT: umulh x10, x0, x2 ; AARCH-NEXT: asr x13, x3, #63 @@ -94,106 +79,26 @@ define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 ; AARCH-NEXT: adc x11, x12, x13 ; AARCH-NEXT: asr x12, x9, #63 ; AARCH-NEXT: asr x13, x11, #63 -; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: adds x9, x9, x11 ; AARCH-NEXT: asr x11, x8, #63 +; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: adc x12, x12, x13 ; AARCH-NEXT: adds x9, x15, x9 ; AARCH-NEXT: adc x10, x10, x12 ; AARCH-NEXT: cmp x9, x11 ; AARCH-NEXT: ccmp x10, x11, #0, eq -; AARCH-NEXT: b .LBB1_7 -; AARCH-NEXT: .LBB1_3: // %overflow.no.lhs -; AARCH-NEXT: cmp x3, x8 -; AARCH-NEXT: b.eq .LBB1_8 -; AARCH-NEXT: // %bb.4: // %overflow.no.lhs.only -; AARCH-NEXT: asr x8, x1, #63 -; AARCH-NEXT: asr x10, x3, #63 -; AARCH-NEXT: eor x9, x0, x8 -; AARCH-NEXT: eor x11, x1, x8 -; AARCH-NEXT: eor x12, x2, x10 -; AARCH-NEXT: subs x9, x9, x8 -; AARCH-NEXT: sbc x8, x11, x8 -; AARCH-NEXT: cmp x1, #0 -; AARCH-NEXT: eor x11, x3, x10 -; AARCH-NEXT: cset w13, lt -; AARCH-NEXT: csel x8, x8, x1, lt -; AARCH-NEXT: csel x9, x9, x0, lt -; AARCH-NEXT: subs x12, x12, x10 -; AARCH-NEXT: sbc x10, x11, x10 -; AARCH-NEXT: cmp x3, #0 -; AARCH-NEXT: csel x11, x12, x2, lt -; AARCH-NEXT: csel x10, x10, x3, lt -; AARCH-NEXT: umulh x12, x9, x11 -; AARCH-NEXT: mul x15, x8, x10 -; AARCH-NEXT: madd x8, x8, x11, x12 -; AARCH-NEXT: cset w12, lt -; AARCH-NEXT: mul x14, x9, x11 -; AARCH-NEXT: mul x11, x9, x10 -; AARCH-NEXT: umulh x9, x9, x10 -; AARCH-NEXT: eor w10, w12, w13 -; AARCH-NEXT: b .LBB1_6 -; AARCH-NEXT: .LBB1_5: // %overflow.no.rhs.only -; AARCH-NEXT: asr x8, x3, #63 -; AARCH-NEXT: asr x10, x1, #63 -; AARCH-NEXT: eor x9, x2, x8 -; AARCH-NEXT: eor x11, x3, x8 -; AARCH-NEXT: eor x12, x0, x10 -; AARCH-NEXT: subs x9, x9, x8 -; AARCH-NEXT: sbc x8, x11, x8 -; AARCH-NEXT: cmp x3, #0 -; AARCH-NEXT: eor x11, x1, x10 -; AARCH-NEXT: cset w13, lt -; AARCH-NEXT: csel x8, x8, x3, lt -; AARCH-NEXT: csel x9, x9, x2, lt -; AARCH-NEXT: subs x12, x12, x10 -; AARCH-NEXT: sbc x10, x11, x10 -; AARCH-NEXT: cmp x1, #0 -; AARCH-NEXT: csel x11, x12, x0, lt -; AARCH-NEXT: csel x10, x10, x1, lt -; AARCH-NEXT: umulh x12, x9, x11 -; AARCH-NEXT: mul x14, x9, x11 -; AARCH-NEXT: mul x15, x8, x10 -; AARCH-NEXT: madd x8, x8, x11, x12 -; AARCH-NEXT: cset w12, lt -; AARCH-NEXT: mul x11, x9, x10 -; AARCH-NEXT: umulh x9, x9, x10 -; AARCH-NEXT: eor w10, w13, w12 -; AARCH-NEXT: .LBB1_6: // %overflow.res -; AARCH-NEXT: sbfx x12, x10, #0, #1 -; AARCH-NEXT: adds x8, x8, x11 -; AARCH-NEXT: adc x9, x9, x15 -; AARCH-NEXT: eor x13, x14, x12 -; AARCH-NEXT: eor x8, x8, x12 -; AARCH-NEXT: eor x9, x9, x12 -; AARCH-NEXT: add x0, x13, x10 -; AARCH-NEXT: cmp x0, x10 -; AARCH-NEXT: cset w10, lo -; AARCH-NEXT: cinc x8, x8, lo -; AARCH-NEXT: cmp x8, x10 -; AARCH-NEXT: cinc x9, x9, lo -; AARCH-NEXT: cmp x9, #0 -; AARCH-NEXT: .LBB1_7: // %overflow.res ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbnz x1, #63, .LBB1_9 -; AARCH-NEXT: b .LBB1_10 -; AARCH-NEXT: .LBB1_8: // %overflow.no -; AARCH-NEXT: umulh x8, x0, x2 -; AARCH-NEXT: mov w9, wzr -; AARCH-NEXT: madd x8, x0, x3, x8 -; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: madd x8, x1, x2, x8 -; AARCH-NEXT: tbz x1, #63, .LBB1_10 -; AARCH-NEXT: .LBB1_9: // %overflow.res +; AARCH-NEXT: tbz x1, #63, .LBB1_5 +; AARCH-NEXT: .LBB1_4: // %overflow.res ; AARCH-NEXT: eor x10, x3, #0x8000000000000000 ; AARCH-NEXT: orr x10, x2, x10 -; AARCH-NEXT: cbz x10, .LBB1_11 -; AARCH-NEXT: .LBB1_10: // %Else2 -; AARCH-NEXT: tbz w9, #0, .LBB1_12 -; AARCH-NEXT: .LBB1_11: // %Then7 +; 
AARCH-NEXT: cbz x10, .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %Else2 +; AARCH-NEXT: tbz w9, #0, .LBB1_7 +; AARCH-NEXT: .LBB1_6: // %Then7 ; AARCH-NEXT: mov w9, #1 // =0x1 ; AARCH-NEXT: str w9, [x4] -; AARCH-NEXT: .LBB1_12: // %Block9 +; AARCH-NEXT: .LBB1_7: // %Block9 ; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: diff --git a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll index 8f35b6df7a937..4eb82c80e2bff 100644 --- a/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-128-legalisation-lowering.ll @@ -4,425 +4,212 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; ARMV6-LABEL: muloti_test: -; ARMV6: @ %bb.0: @ %overflow.entry +; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; ARMV6-NEXT: sub sp, sp, #28 -; ARMV6-NEXT: add lr, sp, #76 +; ARMV6-NEXT: ldr r4, [sp, #72] +; ARMV6-NEXT: mov r7, r0 +; ARMV6-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMV6-NEXT: ldr r12, [sp, #64] +; ARMV6-NEXT: umull r1, r0, r2, r4 ; ARMV6-NEXT: ldr r5, [sp, #68] -; ARMV6-NEXT: ldr r6, [sp, #64] -; ARMV6-NEXT: mov r9, r0 -; ARMV6-NEXT: ldr r11, [sp, #72] -; ARMV6-NEXT: orrs r10, r6, r5 -; ARMV6-NEXT: ldm lr, {r1, r12, lr} -; ARMV6-NEXT: beq .LBB0_3 -; ARMV6-NEXT: @ %bb.1: @ %overflow.lhs -; ARMV6-NEXT: orrs r8, r12, lr -; ARMV6-NEXT: beq .LBB0_5 -; ARMV6-NEXT: @ %bb.2: @ %overflow -; ARMV6-NEXT: umull r4, r0, r3, r12 -; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: umull r7, r0, lr, r2 -; ARMV6-NEXT: str r0, [sp, #12] @ 4-byte Spill -; ARMV6-NEXT: umull r0, r12, r12, r2 -; ARMV6-NEXT: add r4, r7, r4 -; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: mov r0, #0 -; ARMV6-NEXT: adds r7, r12, r4 -; ARMV6-NEXT: str r7, [sp] @ 4-byte Spill -; ARMV6-NEXT: adc r0, r0, #0 -; ARMV6-NEXT: str r0, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: mov r0, r11 -; ARMV6-NEXT: umull r11, r12, r1, r6 -; ARMV6-NEXT: umull r7, r4, r5, r0 -; ARMV6-NEXT: add r7, r7, r11 -; ARMV6-NEXT: umull r11, r6, r6, r0 -; ARMV6-NEXT: adds r6, r6, r7 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: adc r7, r7, #0 -; ARMV6-NEXT: str r7, [sp, #4] @ 4-byte Spill -; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: adds r7, r11, r7 -; ARMV6-NEXT: str r7, [sp, #8] @ 4-byte Spill -; ARMV6-NEXT: ldr r7, [sp] @ 4-byte Reload -; ARMV6-NEXT: adc r6, r6, r7 -; ARMV6-NEXT: str r6, [sp] @ 4-byte Spill -; ARMV6-NEXT: umull r11, r6, r2, r0 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: umlal r6, r7, r3, r0 -; ARMV6-NEXT: umull r2, r0, r2, r1 -; ARMV6-NEXT: adds r2, r2, r6 -; ARMV6-NEXT: str r2, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: adcs r0, r7, r0 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: adc r6, r7, #0 -; ARMV6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; ARMV6-NEXT: umlal r0, r6, r3, r1 -; ARMV6-NEXT: adds r2, r0, r2 -; ARMV6-NEXT: ldr r0, [sp] @ 4-byte Reload -; ARMV6-NEXT: adcs r0, r6, r0 -; ARMV6-NEXT: adc r6, r7, #0 -; ARMV6-NEXT: cmp r8, #0 -; ARMV6-NEXT: movne r8, #1 -; ARMV6-NEXT: cmp r10, #0 +; ARMV6-NEXT: str r1, [r7] +; ARMV6-NEXT: ldr r1, [sp, #76] +; ARMV6-NEXT: umull r7, r6, r1, r12 +; ARMV6-NEXT: str r6, [sp, #8] @ 4-byte Spill +; ARMV6-NEXT: umull r6, r9, r5, r4 +; ARMV6-NEXT: add r7, r6, r7 +; ARMV6-NEXT: umull r4, r6, r12, r4 +; ARMV6-NEXT: str r4, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: mov r4, #0 +; ARMV6-NEXT: adds r8, r6, r7 +; ARMV6-NEXT: ldr r6, [sp, #80] +; ARMV6-NEXT: adc r7, r4, #0 +; ARMV6-NEXT: ldr r4, [sp, #84] +; ARMV6-NEXT: str r7, [sp, #24] @ 4-byte Spill 
+; ARMV6-NEXT: umull r12, lr, r3, r6 +; ARMV6-NEXT: umull r11, r7, r4, r2 +; ARMV6-NEXT: add r12, r11, r12 +; ARMV6-NEXT: umull r11, r10, r6, r2 +; ARMV6-NEXT: adds r12, r10, r12 +; ARMV6-NEXT: mov r10, #0 +; ARMV6-NEXT: adc r6, r10, #0 +; ARMV6-NEXT: str r6, [sp, #20] @ 4-byte Spill +; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; ARMV6-NEXT: adds r6, r6, r11 +; ARMV6-NEXT: str r6, [sp, #12] @ 4-byte Spill +; ARMV6-NEXT: adc r6, r8, r12 +; ARMV6-NEXT: str r6, [sp, #16] @ 4-byte Spill +; ARMV6-NEXT: ldr r6, [sp, #72] +; ARMV6-NEXT: mov r12, #0 +; ARMV6-NEXT: umull r2, r8, r2, r1 +; ARMV6-NEXT: umlal r0, r12, r3, r6 +; ARMV6-NEXT: adds r0, r2, r0 +; ARMV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload +; ARMV6-NEXT: adcs r8, r12, r8 +; ARMV6-NEXT: adc r12, r10, #0 +; ARMV6-NEXT: cmp lr, #0 +; ARMV6-NEXT: str r0, [r2, #4] +; ARMV6-NEXT: movne lr, #1 +; ARMV6-NEXT: ldr r11, [sp, #8] @ 4-byte Reload +; ARMV6-NEXT: cmp r7, #0 +; ARMV6-NEXT: movne r7, #1 +; ARMV6-NEXT: ldr r0, [sp, #64] +; ARMV6-NEXT: cmp r11, #0 +; ARMV6-NEXT: umlal r8, r12, r3, r1 +; ARMV6-NEXT: movne r11, #1 +; ARMV6-NEXT: cmp r9, #0 +; ARMV6-NEXT: movne r9, #1 +; ARMV6-NEXT: orrs r10, r0, r5 +; ARMV6-NEXT: ldr r0, [sp, #80] ; ARMV6-NEXT: movne r10, #1 +; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; ARMV6-NEXT: orrs r0, r0, r4 +; ARMV6-NEXT: movne r0, #1 ; ARMV6-NEXT: cmp r4, #0 ; ARMV6-NEXT: movne r4, #1 -; ARMV6-NEXT: cmp r1, #0 -; ARMV6-NEXT: movne r1, #1 +; ARMV6-NEXT: cmp r3, #0 +; ARMV6-NEXT: movne r3, #1 ; ARMV6-NEXT: cmp r5, #0 ; ARMV6-NEXT: movne r5, #1 +; ARMV6-NEXT: cmp r1, #0 +; ARMV6-NEXT: movne r1, #1 +; ARMV6-NEXT: adds r6, r8, r6 +; ARMV6-NEXT: str r6, [r2, #8] ; ARMV6-NEXT: and r1, r5, r1 -; ARMV6-NEXT: cmp r12, #0 -; ARMV6-NEXT: orr r1, r1, r4 -; ARMV6-NEXT: ldr r5, [sp, #4] @ 4-byte Reload -; ARMV6-NEXT: movne r12, #1 -; ARMV6-NEXT: orr r1, r1, r12 -; ARMV6-NEXT: str r6, [sp, #8] @ 4-byte Spill -; ARMV6-NEXT: and r6, r10, r8 -; ARMV6-NEXT: orr r1, r1, r5 -; ARMV6-NEXT: orr r1, r6, r1 -; ARMV6-NEXT: ldr r6, [sp, #12] @ 4-byte Reload -; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: cmp r6, #0 -; ARMV6-NEXT: movne r6, #1 -; ARMV6-NEXT: cmp r3, #0 -; ARMV6-NEXT: movne r3, #1 -; ARMV6-NEXT: cmp lr, #0 -; ARMV6-NEXT: movne lr, #1 -; ARMV6-NEXT: and r3, lr, r3 -; ARMV6-NEXT: orr r3, r3, r6 -; ARMV6-NEXT: ldr r6, [sp, #20] @ 4-byte Reload -; ARMV6-NEXT: cmp r6, #0 -; ARMV6-NEXT: movne r6, #1 -; ARMV6-NEXT: orr r3, r3, r6 ; ARMV6-NEXT: ldr r6, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: orr r3, r3, r6 +; ARMV6-NEXT: orr r1, r1, r9 +; ARMV6-NEXT: orr r1, r1, r11 +; ARMV6-NEXT: and r0, r10, r0 +; ARMV6-NEXT: adcs r6, r12, r6 +; ARMV6-NEXT: str r6, [r2, #12] +; ARMV6-NEXT: ldr r6, [sp, #24] @ 4-byte Reload +; ARMV6-NEXT: orr r1, r1, r6 +; ARMV6-NEXT: orr r0, r0, r1 +; ARMV6-NEXT: and r1, r4, r3 +; ARMV6-NEXT: orr r1, r1, r7 +; ARMV6-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; ARMV6-NEXT: orr r1, r1, lr ; ARMV6-NEXT: orr r1, r1, r3 -; ARMV6-NEXT: ldr r3, [sp, #8] @ 4-byte Reload -; ARMV6-NEXT: orr r6, r1, r3 -; ARMV6-NEXT: b .LBB0_8 -; ARMV6-NEXT: .LBB0_3: @ %overflow.no.lhs -; ARMV6-NEXT: orrs r6, r12, lr -; ARMV6-NEXT: beq .LBB0_7 -; ARMV6-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; ARMV6-NEXT: umull r0, r4, r2, r12 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: mov r10, #0 -; ARMV6-NEXT: umlal r4, r7, r3, r12 -; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: umull r6, r8, r2, lr -; ARMV6-NEXT: adds r0, r6, r4 -; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: adcs r6, r7, r8 -; ARMV6-NEXT: 
adc r7, r10, #0 -; ARMV6-NEXT: ldr r10, [sp, #64] -; ARMV6-NEXT: umlal r6, r7, r3, lr -; ARMV6-NEXT: umull r0, r8, r12, r10 -; ARMV6-NEXT: mla r4, r12, r5, r8 -; ARMV6-NEXT: mov r8, r11 -; ARMV6-NEXT: adds r12, r6, r0 -; ARMV6-NEXT: mov r6, #0 -; ARMV6-NEXT: mla r4, lr, r10, r4 -; ARMV6-NEXT: adc lr, r7, r4 -; ARMV6-NEXT: umull r11, r4, r2, r11 -; ARMV6-NEXT: umlal r4, r6, r3, r8 -; ARMV6-NEXT: umull r2, r0, r2, r1 -; ARMV6-NEXT: adds r7, r2, r4 -; ARMV6-NEXT: adcs r2, r6, r0 -; ARMV6-NEXT: mov r0, #0 -; ARMV6-NEXT: adc r4, r0, #0 -; ARMV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: umlal r2, r4, r3, r1 -; ARMV6-NEXT: umull r3, r6, r8, r10 -; ARMV6-NEXT: mla r5, r8, r5, r6 -; ARMV6-NEXT: adds r2, r2, r3 -; ARMV6-NEXT: mla r1, r1, r10, r5 -; ARMV6-NEXT: adc r1, r4, r1 -; ARMV6-NEXT: adds r2, r2, r0 -; ARMV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; ARMV6-NEXT: adcs r0, r1, r0 -; ARMV6-NEXT: adcs r1, r12, #0 -; ARMV6-NEXT: adc r3, lr, #0 -; ARMV6-NEXT: b .LBB0_6 -; ARMV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; ARMV6-NEXT: mov r10, r6 -; ARMV6-NEXT: umull r0, r6, r11, r6 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: umlal r6, r7, r1, r10 -; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: umull r4, r8, r11, r5 -; ARMV6-NEXT: adds r0, r4, r6 -; ARMV6-NEXT: str r0, [sp, #20] @ 4-byte Spill -; ARMV6-NEXT: adcs r6, r7, r8 -; ARMV6-NEXT: mov r0, #0 -; ARMV6-NEXT: adc r7, r0, #0 -; ARMV6-NEXT: umull r0, r8, r10, r12 -; ARMV6-NEXT: mla r4, r10, lr, r8 -; ARMV6-NEXT: umlal r6, r7, r1, r5 -; ARMV6-NEXT: mla r4, r5, r12, r4 -; ARMV6-NEXT: adds r10, r6, r0 -; ARMV6-NEXT: adc r0, r7, r4 -; ARMV6-NEXT: str r0, [sp, #16] @ 4-byte Spill -; ARMV6-NEXT: mov r0, r11 -; ARMV6-NEXT: umull r11, r6, r11, r2 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: umull r4, r5, r0, r3 -; ARMV6-NEXT: mov r0, #0 -; ARMV6-NEXT: umlal r6, r7, r1, r2 -; ARMV6-NEXT: adds r8, r4, r6 -; ARMV6-NEXT: adcs r4, r7, r5 -; ARMV6-NEXT: adc r5, r0, #0 -; ARMV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: umlal r4, r5, r1, r3 -; ARMV6-NEXT: mov r7, r8 -; ARMV6-NEXT: umull r1, r6, r2, r12 -; ARMV6-NEXT: mla r2, r2, lr, r6 -; ARMV6-NEXT: adds r1, r4, r1 -; ARMV6-NEXT: mla r2, r3, r12, r2 -; ARMV6-NEXT: adc r3, r5, r2 -; ARMV6-NEXT: adds r2, r1, r0 -; ARMV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; ARMV6-NEXT: adcs r0, r3, r0 -; ARMV6-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; ARMV6-NEXT: adcs r1, r10, #0 -; ARMV6-NEXT: adc r3, r3, #0 -; ARMV6-NEXT: .LBB0_6: @ %overflow.res -; ARMV6-NEXT: orrs r6, r1, r3 -; ARMV6-NEXT: movne r6, #1 -; ARMV6-NEXT: b .LBB0_8 -; ARMV6-NEXT: .LBB0_7: @ %overflow.no -; ARMV6-NEXT: mov r0, r11 -; ARMV6-NEXT: umull r11, r8, r2, r11 -; ARMV6-NEXT: mov r7, #0 -; ARMV6-NEXT: mov r6, #0 -; ARMV6-NEXT: umlal r8, r7, r3, r0 -; ARMV6-NEXT: umull r4, r10, r2, r1 -; ARMV6-NEXT: adds r0, r4, r8 -; ARMV6-NEXT: ldr r4, [sp, #64] -; ARMV6-NEXT: adcs r10, r7, r10 -; ARMV6-NEXT: ldr r7, [sp, #72] -; ARMV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; ARMV6-NEXT: adc r0, r6, #0 -; ARMV6-NEXT: umlal r10, r0, r3, r1 -; ARMV6-NEXT: umull r8, r4, r7, r4 -; ARMV6-NEXT: mla r4, r7, r5, r4 -; ARMV6-NEXT: ldr r5, [sp, #64] -; ARMV6-NEXT: ldr r7, [sp, #24] @ 4-byte Reload -; ARMV6-NEXT: mla r1, r1, r5, r4 -; ARMV6-NEXT: umull r4, r5, r12, r2 -; ARMV6-NEXT: mla r3, r12, r3, r5 -; ARMV6-NEXT: mla r2, lr, r2, r3 -; ARMV6-NEXT: adds r3, r4, r8 -; ARMV6-NEXT: adc r1, r2, r1 -; ARMV6-NEXT: adds r2, r10, r3 -; ARMV6-NEXT: adc r0, r0, r1 -; ARMV6-NEXT: .LBB0_8: @ %overflow.res -; ARMV6-NEXT: str r11, [r9] -; ARMV6-NEXT: str r7, 
[r9, #4] -; ARMV6-NEXT: str r2, [r9, #8] -; ARMV6-NEXT: str r0, [r9, #12] -; ARMV6-NEXT: and r0, r6, #1 -; ARMV6-NEXT: strb r0, [r9, #16] +; ARMV6-NEXT: orr r0, r0, r1 +; ARMV6-NEXT: mov r1, #0 +; ARMV6-NEXT: adc r1, r1, #0 +; ARMV6-NEXT: orr r0, r0, r1 +; ARMV6-NEXT: and r0, r0, #1 +; ARMV6-NEXT: strb r0, [r2, #16] ; ARMV6-NEXT: add sp, sp, #28 ; ARMV6-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; ; ARMV7-LABEL: muloti_test: -; ARMV7: @ %bb.0: @ %overflow.entry +; ARMV7: @ %bb.0: @ %start ; ARMV7-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; ARMV7-NEXT: sub sp, sp, #12 -; ARMV7-NEXT: ldr r7, [sp, #52] -; ARMV7-NEXT: ldr r10, [sp, #48] -; ARMV7-NEXT: ldr r4, [sp, #68] -; ARMV7-NEXT: ldr r9, [sp, #64] -; ARMV7-NEXT: orrs r1, r10, r7 -; ARMV7-NEXT: ldr r12, [sp, #60] -; ARMV7-NEXT: ldr lr, [sp, #56] -; ARMV7-NEXT: beq .LBB0_3 -; ARMV7-NEXT: @ %bb.1: @ %overflow.lhs -; ARMV7-NEXT: orr r5, r9, r4 -; ARMV7-NEXT: cmp r5, #0 -; ARMV7-NEXT: beq .LBB0_5 -; ARMV7-NEXT: @ %bb.2: @ %overflow -; ARMV7-NEXT: movwne r5, #1 +; ARMV7-NEXT: sub sp, sp, #44 +; ARMV7-NEXT: ldr r8, [sp, #88] +; ARMV7-NEXT: mov r9, r0 +; ARMV7-NEXT: ldr r7, [sp, #96] +; ARMV7-NEXT: ldr lr, [sp, #100] +; ARMV7-NEXT: umull r0, r5, r2, r8 +; ARMV7-NEXT: ldr r4, [sp, #80] +; ARMV7-NEXT: str r0, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: umull r1, r0, r3, r7 +; ARMV7-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMV7-NEXT: umull r0, r11, lr, r2 +; ARMV7-NEXT: str r1, [sp, #20] @ 4-byte Spill +; ARMV7-NEXT: ldr r1, [sp, #92] +; ARMV7-NEXT: str r0, [sp] @ 4-byte Spill +; ARMV7-NEXT: umull r0, r10, r7, r2 +; ARMV7-NEXT: mov r7, r1 +; ARMV7-NEXT: umull r6, r12, r1, r4 +; ARMV7-NEXT: str r0, [sp, #40] @ 4-byte Spill +; ARMV7-NEXT: ldr r0, [sp, #84] +; ARMV7-NEXT: str r6, [sp, #24] @ 4-byte Spill +; ARMV7-NEXT: umull r6, r1, r0, r8 +; ARMV7-NEXT: str r6, [sp, #16] @ 4-byte Spill +; ARMV7-NEXT: umull r6, r2, r2, r7 +; ARMV7-NEXT: mov r7, r4 +; ARMV7-NEXT: str r6, [sp, #8] @ 4-byte Spill +; ARMV7-NEXT: str r2, [sp, #12] @ 4-byte Spill +; ARMV7-NEXT: umull r2, r6, r4, r8 +; ARMV7-NEXT: str r2, [sp, #36] @ 4-byte Spill +; ARMV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; ARMV7-NEXT: str r6, [sp, #28] @ 4-byte Spill +; ARMV7-NEXT: mov r6, #0 +; ARMV7-NEXT: str r2, [r9] +; ARMV7-NEXT: umlal r5, r6, r3, r8 +; ARMV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; ARMV7-NEXT: ldr r4, [sp] @ 4-byte Reload +; ARMV7-NEXT: add r4, r4, r2 +; ARMV7-NEXT: adds r2, r10, r4 +; ARMV7-NEXT: str r2, [sp, #20] @ 4-byte Spill +; ARMV7-NEXT: mov r2, #0 +; ARMV7-NEXT: adc r2, r2, #0 +; ARMV7-NEXT: cmp r12, #0 +; ARMV7-NEXT: str r2, [sp, #32] @ 4-byte Spill +; ARMV7-NEXT: movwne r12, #1 ; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: mov r6, r12 +; ARMV7-NEXT: ldr r2, [sp, #96] ; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: and r12, r1, r5 -; ARMV7-NEXT: cmp r6, #0 -; ARMV7-NEXT: mov r1, r6 -; ARMV7-NEXT: mov r8, r6 -; ARMV7-NEXT: umull r6, r5, r7, lr -; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: cmp r7, #0 -; ARMV7-NEXT: movwne r7, #1 -; ARMV7-NEXT: and r1, r7, r1 -; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: cmp r5, #0 -; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: orr r1, r1, r5 -; ARMV7-NEXT: umull r5, r7, r8, r10 -; ARMV7-NEXT: cmp r7, #0 +; ARMV7-NEXT: orrs r10, r7, r0 +; ARMV7-NEXT: movwne r10, #1 +; ARMV7-NEXT: orrs r7, r2, lr +; ARMV7-NEXT: ldr r2, [sp, #92] ; ARMV7-NEXT: movwne r7, #1 -; ARMV7-NEXT: orr r7, r1, r7 -; ARMV7-NEXT: add r1, r6, r5 -; ARMV7-NEXT: umull r8, r6, r10, lr -; ARMV7-NEXT: adds r10, r6, r1 -; ARMV7-NEXT: umull r6, r1, r4, r2 -; ARMV7-NEXT: adc r5, r11, 
#0 -; ARMV7-NEXT: orr r5, r7, r5 -; ARMV7-NEXT: orr r7, r12, r5 -; ARMV7-NEXT: cmp r3, #0 -; ARMV7-NEXT: mov r5, r3 -; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: cmp r4, #0 +; ARMV7-NEXT: cmp r0, #0 +; ARMV7-NEXT: movwne r0, #1 +; ARMV7-NEXT: cmp r2, #0 +; ARMV7-NEXT: mov r4, r2 +; ARMV7-NEXT: mov r8, r2 +; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload ; ARMV7-NEXT: movwne r4, #1 -; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: and r5, r4, r5 -; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: orr r1, r5, r1 -; ARMV7-NEXT: umull r5, r4, r3, r9 +; ARMV7-NEXT: and r0, r0, r4 +; ARMV7-NEXT: mov r4, #0 +; ARMV7-NEXT: adds r5, r2, r5 +; ARMV7-NEXT: str r5, [r9, #4] +; ARMV7-NEXT: orr r0, r0, r1 +; ARMV7-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; ARMV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; ARMV7-NEXT: and r5, r10, r7 +; ARMV7-NEXT: orr r0, r0, r12 +; ARMV7-NEXT: mov r12, #0 +; ARMV7-NEXT: add r1, r2, r1 +; ARMV7-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; ARMV7-NEXT: adcs r2, r6, r2 +; ARMV7-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; ARMV7-NEXT: adc r7, r4, #0 +; ARMV7-NEXT: adds r1, r6, r1 +; ARMV7-NEXT: umlal r2, r7, r3, r8 +; ARMV7-NEXT: adc r4, r4, #0 +; ARMV7-NEXT: orr r0, r0, r4 +; ARMV7-NEXT: orr r0, r5, r0 +; ARMV7-NEXT: ldr r4, [sp, #40] @ 4-byte Reload +; ARMV7-NEXT: ldr r5, [sp, #36] @ 4-byte Reload +; ARMV7-NEXT: adds r5, r5, r4 +; ARMV7-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; ARMV7-NEXT: adc r1, r1, r4 +; ARMV7-NEXT: ldr r4, [sp, #4] @ 4-byte Reload ; ARMV7-NEXT: cmp r4, #0 -; ARMV7-NEXT: add r6, r6, r5 ; ARMV7-NEXT: movwne r4, #1 +; ARMV7-NEXT: cmp r3, #0 +; ARMV7-NEXT: movwne r3, #1 +; ARMV7-NEXT: cmp lr, #0 +; ARMV7-NEXT: movwne lr, #1 +; ARMV7-NEXT: cmp r11, #0 +; ARMV7-NEXT: movwne r11, #1 +; ARMV7-NEXT: adds r2, r2, r5 +; ARMV7-NEXT: and r3, lr, r3 +; ARMV7-NEXT: str r2, [r9, #8] +; ARMV7-NEXT: adcs r1, r7, r1 +; ARMV7-NEXT: str r1, [r9, #12] +; ARMV7-NEXT: orr r1, r3, r11 +; ARMV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload ; ARMV7-NEXT: orr r1, r1, r4 -; ARMV7-NEXT: umull r5, r4, r9, r2 -; ARMV7-NEXT: adds r6, r4, r6 -; ARMV7-NEXT: adc r4, r11, #0 -; ARMV7-NEXT: orr r1, r1, r4 -; ARMV7-NEXT: mov r4, #0 -; ARMV7-NEXT: orr r12, r7, r1 -; ARMV7-NEXT: adds r7, r8, r5 -; ARMV7-NEXT: umull r8, r5, r2, lr -; ARMV7-NEXT: adc r6, r10, r6 -; ARMV7-NEXT: umlal r5, r4, r3, lr -; ARMV7-NEXT: ldr lr, [sp, #60] -; ARMV7-NEXT: umull r2, r1, r2, lr -; ARMV7-NEXT: adds r5, r2, r5 -; ARMV7-NEXT: adcs r1, r4, r1 -; ARMV7-NEXT: adc r4, r11, #0 -; ARMV7-NEXT: umlal r1, r4, r3, lr -; ARMV7-NEXT: adds r2, r1, r7 -; ARMV7-NEXT: adcs r3, r4, r6 -; ARMV7-NEXT: adc r1, r11, #0 -; ARMV7-NEXT: orr r1, r12, r1 -; ARMV7-NEXT: b .LBB0_8 -; ARMV7-NEXT: .LBB0_3: @ %overflow.no.lhs -; ARMV7-NEXT: orrs r1, r9, r4 -; ARMV7-NEXT: beq .LBB0_7 -; ARMV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; ARMV7-NEXT: umull r1, r5, r2, r9 -; ARMV7-NEXT: mov r6, #0 -; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: umlal r5, r6, r3, r9 -; ARMV7-NEXT: str r1, [sp, #8] @ 4-byte Spill -; ARMV7-NEXT: umull r1, r8, r2, r4 -; ARMV7-NEXT: adds r1, r1, r5 -; ARMV7-NEXT: str r1, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: adcs r5, r6, r8 -; ARMV7-NEXT: adc r6, r11, #0 -; ARMV7-NEXT: umull r8, r11, r9, r10 -; ARMV7-NEXT: mla r1, r9, r7, r11 -; ARMV7-NEXT: umlal r5, r6, r3, r4 -; ARMV7-NEXT: mla r1, r4, r10, r1 -; ARMV7-NEXT: adds r4, r5, r8 -; ARMV7-NEXT: umull r8, r5, r2, lr -; ARMV7-NEXT: adc r9, r6, r1 -; ARMV7-NEXT: mov r6, #0 -; ARMV7-NEXT: umlal r5, r6, r3, lr -; ARMV7-NEXT: umull r2, r1, r2, r12 -; ARMV7-NEXT: adds r5, r2, r5 -; ARMV7-NEXT: mov r2, #0 -; 
ARMV7-NEXT: adcs r1, r6, r1 -; ARMV7-NEXT: adc r2, r2, #0 -; ARMV7-NEXT: umlal r1, r2, r3, r12 -; ARMV7-NEXT: umull r3, r6, lr, r10 -; ARMV7-NEXT: mla r7, lr, r7, r6 -; ARMV7-NEXT: adds r1, r1, r3 -; ARMV7-NEXT: mla r7, r12, r10, r7 -; ARMV7-NEXT: adc r3, r2, r7 -; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; ARMV7-NEXT: adds r2, r1, r2 -; ARMV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: adcs r3, r3, r1 -; ARMV7-NEXT: adcs r1, r4, #0 -; ARMV7-NEXT: adc r7, r9, #0 -; ARMV7-NEXT: b .LBB0_6 -; ARMV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; ARMV7-NEXT: umull r1, r5, lr, r10 -; ARMV7-NEXT: mov r11, #0 -; ARMV7-NEXT: umull r6, r8, lr, r7 -; ARMV7-NEXT: str r1, [sp, #8] @ 4-byte Spill -; ARMV7-NEXT: mov r1, #0 -; ARMV7-NEXT: umlal r5, r1, r12, r10 -; ARMV7-NEXT: adds r5, r6, r5 -; ARMV7-NEXT: str r5, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: adcs r1, r1, r8 -; ARMV7-NEXT: adc r5, r11, #0 -; ARMV7-NEXT: umull r8, r11, r10, r9 -; ARMV7-NEXT: mla r6, r10, r4, r11 -; ARMV7-NEXT: umlal r1, r5, r12, r7 -; ARMV7-NEXT: mla r6, r7, r9, r6 -; ARMV7-NEXT: mov r7, #0 -; ARMV7-NEXT: adds r10, r1, r8 -; ARMV7-NEXT: adc r11, r5, r6 -; ARMV7-NEXT: umull r8, r5, lr, r2 -; ARMV7-NEXT: umlal r5, r7, r12, r2 -; ARMV7-NEXT: umull r1, r6, lr, r3 -; ARMV7-NEXT: adds r5, r1, r5 -; ARMV7-NEXT: adcs r1, r7, r6 -; ARMV7-NEXT: mov r7, #0 -; ARMV7-NEXT: adc r7, r7, #0 -; ARMV7-NEXT: umlal r1, r7, r12, r3 -; ARMV7-NEXT: umull r12, r6, r2, r9 -; ARMV7-NEXT: mla r2, r2, r4, r6 -; ARMV7-NEXT: adds r1, r1, r12 -; ARMV7-NEXT: mla r2, r3, r9, r2 -; ARMV7-NEXT: adc r3, r7, r2 -; ARMV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; ARMV7-NEXT: adds r2, r1, r2 -; ARMV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: adcs r3, r3, r1 -; ARMV7-NEXT: adcs r1, r10, #0 -; ARMV7-NEXT: adc r7, r11, #0 -; ARMV7-NEXT: .LBB0_6: @ %overflow.res -; ARMV7-NEXT: orrs r1, r1, r7 -; ARMV7-NEXT: movwne r1, #1 -; ARMV7-NEXT: b .LBB0_8 -; ARMV7-NEXT: .LBB0_7: @ %overflow.no -; ARMV7-NEXT: umull r1, r11, r2, lr -; ARMV7-NEXT: mov r6, #0 -; ARMV7-NEXT: umlal r11, r6, r3, lr -; ARMV7-NEXT: str r1, [sp, #4] @ 4-byte Spill -; ARMV7-NEXT: mov r1, #0 -; ARMV7-NEXT: umull r5, r8, r2, r12 -; ARMV7-NEXT: adds r5, r5, r11 -; ARMV7-NEXT: adcs r6, r6, r8 -; ARMV7-NEXT: adc r11, r1, #0 -; ARMV7-NEXT: umlal r6, r11, r3, r12 -; ARMV7-NEXT: umull r8, r12, lr, r10 -; ARMV7-NEXT: str r6, [sp] @ 4-byte Spill -; ARMV7-NEXT: ldr r6, [sp, #60] -; ARMV7-NEXT: mla r7, lr, r7, r12 -; ARMV7-NEXT: str r8, [sp, #8] @ 4-byte Spill -; ARMV7-NEXT: ldr r8, [sp, #4] @ 4-byte Reload -; ARMV7-NEXT: mla r12, r6, r10, r7 -; ARMV7-NEXT: umull lr, r7, r9, r2 -; ARMV7-NEXT: mla r3, r9, r3, r7 -; ARMV7-NEXT: mla r2, r4, r2, r3 -; ARMV7-NEXT: ldr r3, [sp, #8] @ 4-byte Reload -; ARMV7-NEXT: adds r3, lr, r3 -; ARMV7-NEXT: adc r7, r2, r12 -; ARMV7-NEXT: ldr r2, [sp] @ 4-byte Reload -; ARMV7-NEXT: adds r2, r2, r3 -; ARMV7-NEXT: adc r3, r11, r7 -; ARMV7-NEXT: .LBB0_8: @ %overflow.res -; ARMV7-NEXT: str r8, [r0] -; ARMV7-NEXT: and r1, r1, #1 -; ARMV7-NEXT: str r5, [r0, #4] -; ARMV7-NEXT: str r2, [r0, #8] -; ARMV7-NEXT: str r3, [r0, #12] -; ARMV7-NEXT: strb r1, [r0, #16] -; ARMV7-NEXT: add sp, sp, #12 +; ARMV7-NEXT: orr r1, r1, r2 +; ARMV7-NEXT: orr r0, r0, r1 +; ARMV7-NEXT: adc r1, r12, #0 +; ARMV7-NEXT: orr r0, r0, r1 +; ARMV7-NEXT: and r0, r0, #1 +; ARMV7-NEXT: strb r0, [r9, #16] +; ARMV7-NEXT: add sp, sp, #44 ; ARMV7-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git 
a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll index 91ea1a1ad75e9..64d9831442970 100644 --- a/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/ARM/umulo-64-legalisation-lowering.ll @@ -4,18 +4,12 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; ARMV6-LABEL: mulodi_test: -; ARMV6: @ %bb.0: @ %overflow.entry +; ARMV6: @ %bb.0: @ %start ; ARMV6-NEXT: push {r4, r5, r11, lr} -; ARMV6-NEXT: cmp r1, #0 -; ARMV6-NEXT: beq .LBB0_3 -; ARMV6-NEXT: @ %bb.1: @ %overflow.lhs -; ARMV6-NEXT: cmp r3, #0 -; ARMV6-NEXT: beq .LBB0_5 -; ARMV6-NEXT: @ %bb.2: @ %overflow -; ARMV6-NEXT: umull r12, r4, r1, r2 -; ARMV6-NEXT: umull lr, r5, r3, r0 -; ARMV6-NEXT: cmp r4, #0 -; ARMV6-NEXT: movne r4, #1 +; ARMV6-NEXT: umull r12, lr, r1, r2 +; ARMV6-NEXT: umull r4, r5, r3, r0 +; ARMV6-NEXT: cmp lr, #0 +; ARMV6-NEXT: movne lr, #1 ; ARMV6-NEXT: cmp r3, #0 ; ARMV6-NEXT: movne r3, #1 ; ARMV6-NEXT: cmp r1, #0 @@ -23,105 +17,38 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; ARMV6-NEXT: movne r1, #1 ; ARMV6-NEXT: and r1, r1, r3 ; ARMV6-NEXT: cmp r5, #0 -; ARMV6-NEXT: orr r1, r1, r4 +; ARMV6-NEXT: orr r1, r1, lr ; ARMV6-NEXT: movne r5, #1 ; ARMV6-NEXT: orr r3, r1, r5 -; ARMV6-NEXT: add r1, r12, lr +; ARMV6-NEXT: add r1, r12, r4 ; ARMV6-NEXT: adds r1, r2, r1 ; ARMV6-NEXT: mov r5, #0 ; ARMV6-NEXT: adc r2, r5, #0 -; ARMV6-NEXT: orr r12, r3, r2 -; ARMV6-NEXT: and r2, r12, #1 -; ARMV6-NEXT: pop {r4, r5, r11, pc} -; ARMV6-NEXT: .LBB0_3: @ %overflow.no.lhs -; ARMV6-NEXT: cmp r3, #0 -; ARMV6-NEXT: beq .LBB0_7 -; ARMV6-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; ARMV6-NEXT: mov lr, r0 -; ARMV6-NEXT: umull r0, r4, r0, r2 -; ARMV6-NEXT: mov r12, r1 -; ARMV6-NEXT: mla r1, r1, r2, r4 -; ARMV6-NEXT: mul r12, r12, r3 -; ARMV6-NEXT: umlal r1, r12, lr, r3 -; ARMV6-NEXT: b .LBB0_6 -; ARMV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; ARMV6-NEXT: mov r12, r0 -; ARMV6-NEXT: umull r0, lr, r2, r0 -; ARMV6-NEXT: mov r4, r1 -; ARMV6-NEXT: mla r1, r3, r12, lr -; ARMV6-NEXT: mul r12, r3, r4 -; ARMV6-NEXT: umlal r1, r12, r2, r4 -; ARMV6-NEXT: .LBB0_6: @ %overflow.res -; ARMV6-NEXT: cmp r12, #0 -; ARMV6-NEXT: movne r12, #1 -; ARMV6-NEXT: and r2, r12, #1 -; ARMV6-NEXT: pop {r4, r5, r11, pc} -; ARMV6-NEXT: .LBB0_7: @ %overflow.no -; ARMV6-NEXT: mov r12, r0 -; ARMV6-NEXT: umull r0, r4, r0, r2 -; ARMV6-NEXT: mla r3, r12, r3, r4 -; ARMV6-NEXT: mov r12, #0 -; ARMV6-NEXT: mla r1, r1, r2, r3 -; ARMV6-NEXT: and r2, r12, #1 +; ARMV6-NEXT: orr r2, r3, r2 ; ARMV6-NEXT: pop {r4, r5, r11, pc} ; ; ARMV7-LABEL: mulodi_test: -; ARMV7: @ %bb.0: @ %overflow.entry +; ARMV7: @ %bb.0: @ %start ; ARMV7-NEXT: push {r4, r5, r11, lr} -; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: beq .LBB0_3 -; ARMV7-NEXT: @ %bb.1: @ %overflow.lhs -; ARMV7-NEXT: cmp r3, #0 -; ARMV7-NEXT: beq .LBB0_5 -; ARMV7-NEXT: @ %bb.2: @ %overflow -; ARMV7-NEXT: umull lr, r4, r3, r0 +; ARMV7-NEXT: umull r12, lr, r3, r0 ; ARMV7-NEXT: cmp r3, #0 ; ARMV7-NEXT: movwne r3, #1 ; ARMV7-NEXT: cmp r1, #0 -; ARMV7-NEXT: umull r0, r12, r0, r2 +; ARMV7-NEXT: umull r0, r4, r0, r2 ; ARMV7-NEXT: umull r2, r5, r1, r2 ; ARMV7-NEXT: movwne r1, #1 ; ARMV7-NEXT: and r1, r1, r3 ; ARMV7-NEXT: cmp r5, #0 ; ARMV7-NEXT: movwne r5, #1 -; ARMV7-NEXT: cmp r4, #0 +; ARMV7-NEXT: cmp lr, #0 ; ARMV7-NEXT: orr r1, r1, r5 -; ARMV7-NEXT: movwne r4, #1 -; ARMV7-NEXT: orr r3, r1, r4 -; ARMV7-NEXT: add r1, r2, lr +; ARMV7-NEXT: movwne lr, #1 +; ARMV7-NEXT: orr r3, r1, lr +; ARMV7-NEXT: add r1, r2, r12 ; ARMV7-NEXT: 
mov r2, #0 -; ARMV7-NEXT: adds r1, r12, r1 +; ARMV7-NEXT: adds r1, r4, r1 ; ARMV7-NEXT: adc r2, r2, #0 -; ARMV7-NEXT: orr r12, r3, r2 -; ARMV7-NEXT: and r2, r12, #1 -; ARMV7-NEXT: pop {r4, r5, r11, pc} -; ARMV7-NEXT: .LBB0_3: @ %overflow.no.lhs -; ARMV7-NEXT: mov r5, r0 -; ARMV7-NEXT: umull r0, r4, r0, r2 -; ARMV7-NEXT: cmp r3, #0 -; ARMV7-NEXT: beq .LBB0_7 -; ARMV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; ARMV7-NEXT: mul r12, r1, r3 -; ARMV7-NEXT: mla r1, r1, r2, r4 -; ARMV7-NEXT: umlal r1, r12, r5, r3 -; ARMV7-NEXT: b .LBB0_6 -; ARMV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; ARMV7-NEXT: mov lr, r0 -; ARMV7-NEXT: umull r0, r4, r2, r0 -; ARMV7-NEXT: mov r5, r1 -; ARMV7-NEXT: mul r12, r3, r1 -; ARMV7-NEXT: mla r1, r3, lr, r4 -; ARMV7-NEXT: umlal r1, r12, r2, r5 -; ARMV7-NEXT: .LBB0_6: @ %overflow.res -; ARMV7-NEXT: cmp r12, #0 -; ARMV7-NEXT: movwne r12, #1 -; ARMV7-NEXT: and r2, r12, #1 -; ARMV7-NEXT: pop {r4, r5, r11, pc} -; ARMV7-NEXT: .LBB0_7: @ %overflow.no -; ARMV7-NEXT: mla r3, r5, r3, r4 -; ARMV7-NEXT: mov r12, #0 -; ARMV7-NEXT: mla r1, r1, r2, r3 -; ARMV7-NEXT: and r2, r12, #1 +; ARMV7-NEXT: orr r2, r3, r2 ; ARMV7-NEXT: pop {r4, r5, r11, pc} start: %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2 diff --git a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll index 5498a0741bc23..968c06136225d 100644 --- a/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll +++ b/llvm/test/CodeGen/LoongArch/smul-with-overflow.ll @@ -4,13 +4,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; LA32-LABEL: smuloi64: -; LA32: # %bb.0: # %overflow.entry -; LA32-NEXT: srai.w $a6, $a0, 31 -; LA32-NEXT: srai.w $a5, $a2, 31 -; LA32-NEXT: beq $a1, $a6, .LBB0_3 -; LA32-NEXT: # %bb.1: # %overflow.lhs -; LA32-NEXT: beq $a3, $a5, .LBB0_6 -; LA32-NEXT: # %bb.2: # %overflow +; LA32: # %bb.0: ; LA32-NEXT: mulh.wu $a5, $a0, $a2 ; LA32-NEXT: mul.w $a6, $a1, $a2 ; LA32-NEXT: add.w $a5, $a6, $a5 @@ -44,138 +38,11 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; LA32-NEXT: xor $a1, $a1, $a6 ; LA32-NEXT: xor $a3, $a3, $a6 ; LA32-NEXT: or $a1, $a3, $a1 -; LA32-NEXT: sltu $a6, $zero, $a1 -; LA32-NEXT: b .LBB0_9 -; LA32-NEXT: .LBB0_3: # %overflow.no.lhs -; LA32-NEXT: beq $a3, $a5, .LBB0_8 -; LA32-NEXT: # %bb.4: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a1, .LBB0_10 -; LA32-NEXT: # %bb.5: # %overflow.no.lhs.only -; LA32-NEXT: move $a5, $a0 -; LA32-NEXT: move $a6, $a1 -; LA32-NEXT: bgez $a1, .LBB0_11 -; LA32-NEXT: b .LBB0_12 -; LA32-NEXT: .LBB0_6: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a3, .LBB0_14 -; LA32-NEXT: # %bb.7: # %overflow.no.rhs.only -; LA32-NEXT: move $a5, $a2 -; LA32-NEXT: move $a6, $a3 -; LA32-NEXT: bgez $a3, .LBB0_15 -; LA32-NEXT: b .LBB0_16 -; LA32-NEXT: .LBB0_8: # %overflow.no -; LA32-NEXT: move $a6, $zero -; LA32-NEXT: mulh.wu $a5, $a0, $a2 -; LA32-NEXT: mul.w $a3, $a0, $a3 -; LA32-NEXT: add.w $a3, $a5, $a3 -; LA32-NEXT: mul.w $a1, $a1, $a2 -; LA32-NEXT: add.w $a5, $a3, $a1 -; LA32-NEXT: .LBB0_9: # %overflow.res +; LA32-NEXT: sltu $a1, $zero, $a1 ; LA32-NEXT: mul.w $a0, $a0, $a2 -; LA32-NEXT: b .LBB0_27 -; LA32-NEXT: .LBB0_10: -; LA32-NEXT: sub.w $a5, $zero, $a0 -; LA32-NEXT: sltu $a6, $zero, $a0 -; LA32-NEXT: add.w $a6, $a1, $a6 -; LA32-NEXT: sub.w $a6, $zero, $a6 -; LA32-NEXT: bltz $a1, .LBB0_12 -; LA32-NEXT: .LBB0_11: # %overflow.no.lhs.only -; LA32-NEXT: move $a6, $a1 -; LA32-NEXT: move $a5, $a0 -; LA32-NEXT: .LBB0_12: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a3, .LBB0_18 -; LA32-NEXT: # 
%bb.13: # %overflow.no.lhs.only -; LA32-NEXT: move $a7, $a2 -; LA32-NEXT: move $a0, $a3 -; LA32-NEXT: b .LBB0_19 -; LA32-NEXT: .LBB0_14: -; LA32-NEXT: sub.w $a5, $zero, $a2 -; LA32-NEXT: sltu $a6, $zero, $a2 -; LA32-NEXT: add.w $a6, $a3, $a6 -; LA32-NEXT: sub.w $a6, $zero, $a6 -; LA32-NEXT: bltz $a3, .LBB0_16 -; LA32-NEXT: .LBB0_15: # %overflow.no.rhs.only -; LA32-NEXT: move $a6, $a3 -; LA32-NEXT: move $a5, $a2 -; LA32-NEXT: .LBB0_16: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a1, .LBB0_22 -; LA32-NEXT: # %bb.17: # %overflow.no.rhs.only -; LA32-NEXT: move $a7, $a0 -; LA32-NEXT: move $a2, $a1 -; LA32-NEXT: b .LBB0_23 -; LA32-NEXT: .LBB0_18: -; LA32-NEXT: sub.w $a7, $zero, $a2 -; LA32-NEXT: sltu $a0, $zero, $a2 -; LA32-NEXT: add.w $a0, $a3, $a0 -; LA32-NEXT: sub.w $a0, $zero, $a0 -; LA32-NEXT: .LBB0_19: # %overflow.no.lhs.only -; LA32-NEXT: slti $a1, $a1, 0 -; LA32-NEXT: slti $t0, $a3, 0 -; LA32-NEXT: bltz $a3, .LBB0_21 -; LA32-NEXT: # %bb.20: # %overflow.no.lhs.only -; LA32-NEXT: move $a0, $a3 -; LA32-NEXT: move $a7, $a2 -; LA32-NEXT: .LBB0_21: # %overflow.no.lhs.only -; LA32-NEXT: mulh.wu $a2, $a5, $a7 -; LA32-NEXT: mul.w $a3, $a6, $a7 -; LA32-NEXT: add.w $a2, $a2, $a3 -; LA32-NEXT: mul.w $a3, $a5, $a7 -; LA32-NEXT: mul.w $a6, $a6, $a0 -; LA32-NEXT: mulh.wu $a7, $a5, $a0 -; LA32-NEXT: add.w $a6, $a7, $a6 -; LA32-NEXT: mul.w $a0, $a5, $a0 -; LA32-NEXT: add.w $a5, $a2, $a0 -; LA32-NEXT: sltu $a0, $a5, $a2 -; LA32-NEXT: add.w $a2, $a6, $a0 -; LA32-NEXT: xor $a1, $t0, $a1 -; LA32-NEXT: sub.w $a6, $zero, $a1 -; LA32-NEXT: xor $a0, $a3, $a6 -; LA32-NEXT: add.w $a0, $a0, $a1 -; LA32-NEXT: sltu $a1, $a0, $a1 -; LA32-NEXT: xor $a3, $a5, $a6 -; LA32-NEXT: add.w $a5, $a3, $a1 -; LA32-NEXT: sltu $a1, $a5, $a1 -; LA32-NEXT: xor $a2, $a2, $a6 -; LA32-NEXT: b .LBB0_26 -; LA32-NEXT: .LBB0_22: -; LA32-NEXT: sub.w $a7, $zero, $a0 -; LA32-NEXT: sltu $a2, $zero, $a0 -; LA32-NEXT: add.w $a2, $a1, $a2 -; LA32-NEXT: sub.w $a2, $zero, $a2 -; LA32-NEXT: .LBB0_23: # %overflow.no.rhs.only -; LA32-NEXT: slti $a3, $a3, 0 -; LA32-NEXT: slti $t0, $a1, 0 -; LA32-NEXT: bltz $a1, .LBB0_25 -; LA32-NEXT: # %bb.24: # %overflow.no.rhs.only -; LA32-NEXT: move $a2, $a1 -; LA32-NEXT: move $a7, $a0 -; LA32-NEXT: .LBB0_25: # %overflow.no.rhs.only -; LA32-NEXT: mulh.wu $a0, $a5, $a7 -; LA32-NEXT: mul.w $a1, $a6, $a7 -; LA32-NEXT: add.w $a0, $a0, $a1 -; LA32-NEXT: mul.w $a1, $a5, $a7 -; LA32-NEXT: mul.w $a6, $a6, $a2 -; LA32-NEXT: mulh.wu $a7, $a5, $a2 -; LA32-NEXT: add.w $a6, $a7, $a6 -; LA32-NEXT: mul.w $a2, $a5, $a2 -; LA32-NEXT: add.w $a2, $a0, $a2 -; LA32-NEXT: sltu $a0, $a2, $a0 -; LA32-NEXT: add.w $a6, $a6, $a0 -; LA32-NEXT: xor $a3, $a3, $t0 -; LA32-NEXT: sub.w $a7, $zero, $a3 -; LA32-NEXT: xor $a0, $a1, $a7 -; LA32-NEXT: add.w $a0, $a0, $a3 -; LA32-NEXT: sltu $a1, $a0, $a3 -; LA32-NEXT: xor $a2, $a2, $a7 -; LA32-NEXT: add.w $a5, $a2, $a1 -; LA32-NEXT: sltu $a1, $a5, $a1 -; LA32-NEXT: xor $a2, $a6, $a7 -; LA32-NEXT: .LBB0_26: # %overflow.res -; LA32-NEXT: add.w $a1, $a2, $a1 -; LA32-NEXT: sltu $a6, $zero, $a1 -; LA32-NEXT: .LBB0_27: # %overflow.res ; LA32-NEXT: st.w $a0, $a4, 0 -; LA32-NEXT: andi $a0, $a6, 1 ; LA32-NEXT: st.w $a5, $a4, 4 +; LA32-NEXT: move $a0, $a1 ; LA32-NEXT: ret ; ; LA64-LABEL: smuloi64: @@ -196,7 +63,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-LABEL: smuloi128: -; LA32: # %bb.0: # %overflow.entry +; LA32: # %bb.0: ; LA32-NEXT: addi.w $sp, $sp, -48 ; LA32-NEXT: .cfi_def_cfa_offset 48 ; LA32-NEXT: st.w $ra, $sp, 44 # 
4-byte Folded Spill @@ -221,608 +88,198 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-NEXT: .cfi_offset 29, -36 ; LA32-NEXT: .cfi_offset 30, -40 ; LA32-NEXT: .cfi_offset 31, -44 -; LA32-NEXT: ld.w $a3, $a1, 12 -; LA32-NEXT: ld.w $a7, $a1, 8 -; LA32-NEXT: ld.w $a5, $a1, 0 -; LA32-NEXT: ld.w $a6, $a0, 0 -; LA32-NEXT: ld.w $t0, $a0, 4 -; LA32-NEXT: ld.w $a4, $a0, 12 -; LA32-NEXT: ld.w $a0, $a0, 8 -; LA32-NEXT: ld.w $a1, $a1, 4 -; LA32-NEXT: srai.w $t1, $t0, 31 -; LA32-NEXT: xor $t2, $a4, $t1 -; LA32-NEXT: xor $t1, $a0, $t1 -; LA32-NEXT: or $t2, $t1, $t2 -; LA32-NEXT: srai.w $t1, $a1, 31 -; LA32-NEXT: beq $t2, $zero, .LBB1_11 -; LA32-NEXT: # %bb.1: # %overflow.lhs -; LA32-NEXT: xor $t2, $a7, $t1 -; LA32-NEXT: xor $t1, $a3, $t1 -; LA32-NEXT: or $t1, $t2, $t1 -; LA32-NEXT: beq $t1, $zero, .LBB1_14 -; LA32-NEXT: # %bb.2: # %overflow -; LA32-NEXT: mulh.wu $t1, $a0, $a5 -; LA32-NEXT: mul.w $t2, $a4, $a5 -; LA32-NEXT: add.w $t1, $t2, $t1 -; LA32-NEXT: sltu $t2, $t1, $t2 -; LA32-NEXT: mulh.wu $t3, $a4, $a5 -; LA32-NEXT: add.w $t5, $t3, $t2 -; LA32-NEXT: mul.w $t3, $a0, $a1 -; LA32-NEXT: add.w $t2, $t3, $t1 -; LA32-NEXT: sltu $t1, $t2, $t3 -; LA32-NEXT: mulh.wu $t3, $a0, $a1 -; LA32-NEXT: add.w $t1, $t3, $t1 -; LA32-NEXT: add.w $t1, $t5, $t1 -; LA32-NEXT: mul.w $t6, $a4, $a1 -; LA32-NEXT: add.w $t7, $t6, $t1 -; LA32-NEXT: srai.w $t3, $a4, 31 -; LA32-NEXT: mul.w $t8, $a5, $t3 -; LA32-NEXT: add.w $t4, $t7, $t8 -; LA32-NEXT: sltu $fp, $t4, $t7 +; LA32-NEXT: ld.w $a5, $a1, 12 +; LA32-NEXT: ld.w $a6, $a1, 8 +; LA32-NEXT: ld.w $t1, $a0, 4 +; LA32-NEXT: ld.w $a3, $a1, 0 +; LA32-NEXT: ld.w $a7, $a0, 8 +; LA32-NEXT: ld.w $t0, $a0, 12 +; LA32-NEXT: ld.w $a4, $a0, 0 +; LA32-NEXT: ld.w $t4, $a1, 4 +; LA32-NEXT: mulh.wu $a0, $a7, $a3 +; LA32-NEXT: mul.w $a1, $t0, $a3 +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: sltu $a1, $a0, $a1 +; LA32-NEXT: mulh.wu $t2, $t0, $a3 +; LA32-NEXT: add.w $a1, $t2, $a1 +; LA32-NEXT: mul.w $t3, $a7, $t4 +; LA32-NEXT: add.w $t2, $t3, $a0 +; LA32-NEXT: sltu $a0, $t2, $t3 +; LA32-NEXT: mulh.wu $t3, $a7, $t4 +; LA32-NEXT: add.w $a0, $t3, $a0 +; LA32-NEXT: add.w $t5, $a1, $a0 +; LA32-NEXT: mul.w $t6, $t0, $t4 +; LA32-NEXT: add.w $t7, $t6, $t5 +; LA32-NEXT: srai.w $a0, $t0, 31 +; LA32-NEXT: mul.w $t8, $a3, $a0 +; LA32-NEXT: add.w $t3, $t7, $t8 +; LA32-NEXT: sltu $fp, $t3, $t7 ; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: sltu $t1, $t1, $t5 -; LA32-NEXT: mulh.wu $t5, $a4, $a1 -; LA32-NEXT: add.w $t1, $t5, $t1 -; LA32-NEXT: add.w $t1, $t1, $t6 -; LA32-NEXT: mulh.wu $t5, $a5, $t3 +; LA32-NEXT: sltu $a1, $t5, $a1 +; LA32-NEXT: mulh.wu $t5, $t0, $t4 +; LA32-NEXT: add.w $a1, $t5, $a1 +; LA32-NEXT: add.w $a1, $a1, $t6 +; LA32-NEXT: mulh.wu $t5, $a3, $a0 ; LA32-NEXT: add.w $t5, $t5, $t8 -; LA32-NEXT: mul.w $t6, $a1, $t3 +; LA32-NEXT: mul.w $t6, $t4, $a0 ; LA32-NEXT: add.w $t5, $t5, $t6 -; LA32-NEXT: add.w $t5, $t1, $t5 -; LA32-NEXT: mulh.wu $t1, $a6, $a5 -; LA32-NEXT: mul.w $t6, $t0, $a5 -; LA32-NEXT: add.w $t1, $t6, $t1 -; LA32-NEXT: sltu $t6, $t1, $t6 -; LA32-NEXT: mulh.wu $t7, $t0, $a5 -; LA32-NEXT: add.w $t6, $t7, $t6 -; LA32-NEXT: mul.w $t7, $a6, $a1 -; LA32-NEXT: add.w $t1, $t7, $t1 -; LA32-NEXT: sltu $t7, $t1, $t7 -; LA32-NEXT: mulh.wu $t8, $a6, $a1 -; LA32-NEXT: add.w $t7, $t8, $t7 -; LA32-NEXT: add.w $t7, $t6, $t7 -; LA32-NEXT: mul.w $t8, $t0, $a1 -; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: add.w $t7, $t8, $t7 -; LA32-NEXT: sltu $t8, $t7, $t8 -; LA32-NEXT: mulh.wu $a1, $t0, $a1 -; LA32-NEXT: add.w $a1, $a1, $t6 -; LA32-NEXT: add.w $a1, $a1, $t8 -; LA32-NEXT: 
add.w $t8, $t2, $a1 -; LA32-NEXT: mul.w $t6, $a0, $a5 -; LA32-NEXT: add.w $a1, $t6, $t7 +; LA32-NEXT: add.w $t8, $a1, $t5 +; LA32-NEXT: mulh.wu $a1, $a4, $a3 +; LA32-NEXT: mul.w $t5, $t1, $a3 +; LA32-NEXT: add.w $a1, $t5, $a1 +; LA32-NEXT: sltu $t5, $a1, $t5 +; LA32-NEXT: mulh.wu $t6, $t1, $a3 +; LA32-NEXT: add.w $t5, $t6, $t5 +; LA32-NEXT: mul.w $t6, $a4, $t4 +; LA32-NEXT: add.w $a1, $t6, $a1 ; LA32-NEXT: sltu $t6, $a1, $t6 -; LA32-NEXT: add.w $t7, $t8, $t6 -; LA32-NEXT: add.w $t5, $t5, $fp -; LA32-NEXT: beq $t7, $t2, .LBB1_4 -; LA32-NEXT: # %bb.3: # %overflow -; LA32-NEXT: sltu $t6, $t7, $t2 -; LA32-NEXT: .LBB1_4: # %overflow -; LA32-NEXT: add.w $t6, $t4, $t6 -; LA32-NEXT: sltu $t2, $t6, $t4 -; LA32-NEXT: add.w $t5, $t5, $t2 -; LA32-NEXT: mulh.wu $t2, $a6, $a7 -; LA32-NEXT: mul.w $t4, $t0, $a7 -; LA32-NEXT: add.w $t2, $t4, $t2 -; LA32-NEXT: sltu $t4, $t2, $t4 -; LA32-NEXT: mulh.wu $t8, $t0, $a7 -; LA32-NEXT: add.w $s0, $t8, $t4 -; LA32-NEXT: mul.w $t4, $a6, $a3 -; LA32-NEXT: add.w $t8, $t4, $t2 -; LA32-NEXT: sltu $t2, $t8, $t4 -; LA32-NEXT: mulh.wu $t4, $a6, $a3 -; LA32-NEXT: add.w $t2, $t4, $t2 +; LA32-NEXT: mulh.wu $t7, $a4, $t4 +; LA32-NEXT: add.w $t6, $t7, $t6 +; LA32-NEXT: add.w $t6, $t5, $t6 +; LA32-NEXT: mul.w $t7, $t1, $t4 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: add.w $t6, $t7, $t6 +; LA32-NEXT: sltu $t7, $t6, $t7 +; LA32-NEXT: mulh.wu $t4, $t1, $t4 +; LA32-NEXT: add.w $t4, $t4, $t5 +; LA32-NEXT: add.w $t4, $t4, $t7 +; LA32-NEXT: add.w $t4, $t2, $t4 +; LA32-NEXT: mul.w $t5, $a7, $a3 +; LA32-NEXT: add.w $t6, $t5, $t6 +; LA32-NEXT: sltu $t5, $t6, $t5 +; LA32-NEXT: add.w $t7, $t4, $t5 +; LA32-NEXT: add.w $t4, $t8, $fp +; LA32-NEXT: beq $t7, $t2, .LBB1_2 +; LA32-NEXT: # %bb.1: +; LA32-NEXT: sltu $t5, $t7, $t2 +; LA32-NEXT: .LBB1_2: +; LA32-NEXT: add.w $t5, $t3, $t5 +; LA32-NEXT: sltu $t2, $t5, $t3 +; LA32-NEXT: add.w $t4, $t4, $t2 +; LA32-NEXT: mulh.wu $t2, $a4, $a6 +; LA32-NEXT: mul.w $t3, $t1, $a6 +; LA32-NEXT: add.w $t2, $t3, $t2 +; LA32-NEXT: sltu $t3, $t2, $t3 +; LA32-NEXT: mulh.wu $t8, $t1, $a6 +; LA32-NEXT: add.w $s0, $t8, $t3 +; LA32-NEXT: mul.w $t3, $a4, $a5 +; LA32-NEXT: add.w $t8, $t3, $t2 +; LA32-NEXT: sltu $t2, $t8, $t3 +; LA32-NEXT: mulh.wu $t3, $a4, $a5 +; LA32-NEXT: add.w $t2, $t3, $t2 ; LA32-NEXT: add.w $t2, $s0, $t2 -; LA32-NEXT: mul.w $s1, $t0, $a3 +; LA32-NEXT: mul.w $s1, $t1, $a5 ; LA32-NEXT: add.w $s2, $s1, $t2 -; LA32-NEXT: srai.w $t4, $a3, 31 -; LA32-NEXT: mul.w $s3, $t4, $a6 +; LA32-NEXT: srai.w $t3, $a5, 31 +; LA32-NEXT: mul.w $s3, $t3, $a4 ; LA32-NEXT: add.w $fp, $s2, $s3 ; LA32-NEXT: sltu $s4, $fp, $s2 ; LA32-NEXT: sltu $s1, $s2, $s1 ; LA32-NEXT: sltu $t2, $t2, $s0 -; LA32-NEXT: mulh.wu $s0, $t0, $a3 +; LA32-NEXT: mulh.wu $s0, $t1, $a5 ; LA32-NEXT: add.w $t2, $s0, $t2 ; LA32-NEXT: add.w $t2, $t2, $s1 -; LA32-NEXT: mul.w $t0, $t4, $t0 -; LA32-NEXT: mulh.wu $s0, $t4, $a6 -; LA32-NEXT: add.w $t0, $s0, $t0 -; LA32-NEXT: add.w $t0, $t0, $s3 -; LA32-NEXT: add.w $t0, $t2, $t0 -; LA32-NEXT: add.w $s0, $t8, $t7 -; LA32-NEXT: mul.w $t7, $a6, $a7 -; LA32-NEXT: add.w $t2, $t7, $a1 -; LA32-NEXT: sltu $t7, $t2, $t7 -; LA32-NEXT: add.w $a1, $s0, $t7 -; LA32-NEXT: add.w $t0, $t0, $s4 -; LA32-NEXT: beq $a1, $t8, .LBB1_6 -; LA32-NEXT: # %bb.5: # %overflow -; LA32-NEXT: sltu $t7, $a1, $t8 -; LA32-NEXT: .LBB1_6: # %overflow +; LA32-NEXT: mul.w $t1, $t3, $t1 +; LA32-NEXT: mulh.wu $s0, $t3, $a4 +; LA32-NEXT: add.w $t1, $s0, $t1 +; LA32-NEXT: add.w $t1, $t1, $s3 +; LA32-NEXT: add.w $s0, $t2, $t1 +; LA32-NEXT: add.w $t2, $t8, $t7 +; LA32-NEXT: mul.w $t7, $a4, $a6 +; 
LA32-NEXT: add.w $t1, $t7, $t6 +; LA32-NEXT: sltu $t7, $t1, $t7 +; LA32-NEXT: add.w $t2, $t2, $t7 +; LA32-NEXT: add.w $t6, $s0, $s4 +; LA32-NEXT: beq $t2, $t8, .LBB1_4 +; LA32-NEXT: # %bb.3: +; LA32-NEXT: sltu $t7, $t2, $t8 +; LA32-NEXT: .LBB1_4: ; LA32-NEXT: add.w $t7, $fp, $t7 ; LA32-NEXT: sltu $t8, $t7, $fp -; LA32-NEXT: add.w $t8, $t0, $t8 -; LA32-NEXT: add.w $t0, $t5, $t8 -; LA32-NEXT: add.w $t7, $t6, $t7 -; LA32-NEXT: sltu $s0, $t7, $t6 -; LA32-NEXT: add.w $s4, $t0, $s0 -; LA32-NEXT: mulh.wu $t0, $a0, $a7 -; LA32-NEXT: mul.w $s1, $a4, $a7 -; LA32-NEXT: add.w $s3, $s1, $t0 -; LA32-NEXT: mul.w $fp, $a0, $a3 +; LA32-NEXT: add.w $t8, $t6, $t8 +; LA32-NEXT: add.w $t6, $t4, $t8 +; LA32-NEXT: add.w $t7, $t5, $t7 +; LA32-NEXT: sltu $s0, $t7, $t5 +; LA32-NEXT: add.w $s4, $t6, $s0 +; LA32-NEXT: mulh.wu $t5, $a7, $a6 +; LA32-NEXT: mul.w $s1, $t0, $a6 +; LA32-NEXT: add.w $s3, $s1, $t5 +; LA32-NEXT: mul.w $fp, $a7, $a5 ; LA32-NEXT: add.w $s2, $fp, $s3 ; LA32-NEXT: add.w $t6, $s2, $s4 -; LA32-NEXT: mul.w $s5, $a0, $a7 -; LA32-NEXT: add.w $t0, $s5, $t7 -; LA32-NEXT: sltu $t7, $t0, $s5 +; LA32-NEXT: mul.w $s5, $a7, $a6 +; LA32-NEXT: add.w $t5, $s5, $t7 +; LA32-NEXT: sltu $t7, $t5, $s5 ; LA32-NEXT: add.w $t6, $t6, $t7 -; LA32-NEXT: beq $t6, $s2, .LBB1_8 -; LA32-NEXT: # %bb.7: # %overflow +; LA32-NEXT: beq $t6, $s2, .LBB1_6 +; LA32-NEXT: # %bb.5: ; LA32-NEXT: sltu $t7, $t6, $s2 -; LA32-NEXT: .LBB1_8: # %overflow -; LA32-NEXT: beq $s4, $t5, .LBB1_10 -; LA32-NEXT: # %bb.9: # %overflow -; LA32-NEXT: sltu $s0, $s4, $t5 -; LA32-NEXT: .LBB1_10: # %overflow -; LA32-NEXT: srai.w $t5, $t5, 31 +; LA32-NEXT: .LBB1_6: +; LA32-NEXT: beq $s4, $t4, .LBB1_8 +; LA32-NEXT: # %bb.7: +; LA32-NEXT: sltu $s0, $s4, $t4 +; LA32-NEXT: .LBB1_8: +; LA32-NEXT: srai.w $t4, $t4, 31 ; LA32-NEXT: srai.w $t8, $t8, 31 -; LA32-NEXT: add.w $t8, $t5, $t8 +; LA32-NEXT: add.w $t8, $t4, $t8 ; LA32-NEXT: add.w $s0, $t8, $s0 ; LA32-NEXT: sltu $s1, $s3, $s1 -; LA32-NEXT: mulh.wu $s3, $a4, $a7 +; LA32-NEXT: mulh.wu $s3, $t0, $a6 ; LA32-NEXT: add.w $s1, $s3, $s1 ; LA32-NEXT: sltu $fp, $s2, $fp -; LA32-NEXT: mulh.wu $s2, $a0, $a3 +; LA32-NEXT: mulh.wu $s2, $a7, $a5 ; LA32-NEXT: add.w $fp, $s2, $fp ; LA32-NEXT: add.w $fp, $s1, $fp -; LA32-NEXT: mul.w $s2, $a4, $a3 +; LA32-NEXT: mul.w $s2, $t0, $a5 ; LA32-NEXT: add.w $s3, $s2, $fp -; LA32-NEXT: mul.w $s4, $a7, $t3 -; LA32-NEXT: mul.w $s5, $t4, $a0 +; LA32-NEXT: mul.w $s4, $a6, $a0 +; LA32-NEXT: mul.w $s5, $t3, $a7 ; LA32-NEXT: add.w $s6, $s5, $s4 ; LA32-NEXT: add.w $s7, $s3, $s6 ; LA32-NEXT: add.w $s8, $s7, $s0 ; LA32-NEXT: add.w $t7, $s8, $t7 ; LA32-NEXT: sltu $ra, $t7, $s8 -; LA32-NEXT: sltu $t5, $t8, $t5 -; LA32-NEXT: add.w $t5, $t8, $t5 +; LA32-NEXT: sltu $t4, $t8, $t4 +; LA32-NEXT: add.w $t4, $t8, $t4 ; LA32-NEXT: sltu $t8, $s0, $t8 -; LA32-NEXT: add.w $t5, $t5, $t8 +; LA32-NEXT: add.w $t4, $t4, $t8 ; LA32-NEXT: sltu $t8, $s7, $s3 ; LA32-NEXT: sltu $s0, $s3, $s2 ; LA32-NEXT: sltu $fp, $fp, $s1 -; LA32-NEXT: mulh.wu $s1, $a4, $a3 +; LA32-NEXT: mulh.wu $s1, $t0, $a5 ; LA32-NEXT: add.w $fp, $s1, $fp ; LA32-NEXT: add.w $fp, $fp, $s0 -; LA32-NEXT: mulh.wu $a7, $a7, $t3 -; LA32-NEXT: add.w $a7, $a7, $s4 -; LA32-NEXT: mul.w $a3, $a3, $t3 -; LA32-NEXT: add.w $a3, $a7, $a3 -; LA32-NEXT: mul.w $a4, $t4, $a4 -; LA32-NEXT: mulh.wu $a0, $t4, $a0 -; LA32-NEXT: add.w $a0, $a0, $a4 -; LA32-NEXT: add.w $a0, $a0, $s5 -; LA32-NEXT: add.w $a0, $a0, $a3 -; LA32-NEXT: sltu $a3, $s6, $s5 -; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: mulh.wu $a6, $a6, $a0 +; LA32-NEXT: add.w $a6, $a6, $s4 +; LA32-NEXT: mul.w $a0, 
$a5, $a0 +; LA32-NEXT: add.w $a0, $a6, $a0 +; LA32-NEXT: mul.w $a5, $t3, $t0 +; LA32-NEXT: mulh.wu $a6, $t3, $a7 +; LA32-NEXT: add.w $a5, $a6, $a5 +; LA32-NEXT: add.w $a5, $a5, $s5 +; LA32-NEXT: add.w $a0, $a5, $a0 +; LA32-NEXT: sltu $a5, $s6, $s5 +; LA32-NEXT: add.w $a0, $a0, $a5 ; LA32-NEXT: add.w $a0, $fp, $a0 ; LA32-NEXT: add.w $a0, $a0, $t8 -; LA32-NEXT: add.w $a0, $a0, $t5 -; LA32-NEXT: sltu $a3, $s8, $s7 -; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: add.w $a0, $a0, $t4 +; LA32-NEXT: sltu $a5, $s8, $s7 +; LA32-NEXT: add.w $a0, $a0, $a5 ; LA32-NEXT: add.w $a0, $a0, $ra -; LA32-NEXT: srai.w $a3, $a1, 31 -; LA32-NEXT: xor $a0, $a0, $a3 -; LA32-NEXT: xor $a4, $t6, $a3 -; LA32-NEXT: or $a0, $a4, $a0 -; LA32-NEXT: xor $a4, $t7, $a3 -; LA32-NEXT: xor $a3, $t0, $a3 -; LA32-NEXT: or $a3, $a3, $a4 -; LA32-NEXT: or $a0, $a3, $a0 -; LA32-NEXT: sltu $t3, $zero, $a0 -; LA32-NEXT: b .LBB1_17 -; LA32-NEXT: .LBB1_11: # %overflow.no.lhs -; LA32-NEXT: xor $t2, $a7, $t1 -; LA32-NEXT: xor $t1, $a3, $t1 -; LA32-NEXT: or $t1, $t2, $t1 -; LA32-NEXT: beq $t1, $zero, .LBB1_16 -; LA32-NEXT: # %bb.12: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a4, .LBB1_18 -; LA32-NEXT: # %bb.13: # %overflow.no.lhs.only -; LA32-NEXT: move $t1, $a0 -; LA32-NEXT: move $t3, $a4 -; LA32-NEXT: move $t2, $a6 -; LA32-NEXT: move $t4, $t0 -; LA32-NEXT: bgez $a4, .LBB1_19 -; LA32-NEXT: b .LBB1_20 -; LA32-NEXT: .LBB1_14: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a3, .LBB1_35 -; LA32-NEXT: # %bb.15: # %overflow.no.rhs.only -; LA32-NEXT: move $t1, $a7 -; LA32-NEXT: move $t3, $a3 -; LA32-NEXT: move $t2, $a5 -; LA32-NEXT: move $t4, $a1 -; LA32-NEXT: bgez $a3, .LBB1_36 -; LA32-NEXT: b .LBB1_37 -; LA32-NEXT: .LBB1_16: # %overflow.no -; LA32-NEXT: move $t3, $zero -; LA32-NEXT: mulh.wu $t1, $a6, $a5 -; LA32-NEXT: mul.w $t2, $t0, $a5 -; LA32-NEXT: add.w $t1, $t2, $t1 -; LA32-NEXT: sltu $t2, $t1, $t2 -; LA32-NEXT: mulh.wu $t4, $t0, $a5 -; LA32-NEXT: add.w $t4, $t4, $t2 -; LA32-NEXT: mul.w $t2, $a6, $a1 -; LA32-NEXT: add.w $t1, $t2, $t1 -; LA32-NEXT: sltu $t2, $t1, $t2 -; LA32-NEXT: mulh.wu $t5, $a6, $a1 -; LA32-NEXT: add.w $t2, $t5, $t2 -; LA32-NEXT: add.w $t5, $t4, $t2 -; LA32-NEXT: mul.w $t6, $t0, $a1 -; LA32-NEXT: add.w $t7, $t6, $t5 -; LA32-NEXT: mul.w $t2, $a5, $a0 -; LA32-NEXT: mul.w $t8, $a7, $a6 -; LA32-NEXT: add.w $fp, $t8, $t2 -; LA32-NEXT: add.w $t2, $t7, $fp -; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: sltu $t7, $t2, $t7 -; LA32-NEXT: sltu $t4, $t5, $t4 -; LA32-NEXT: mulh.wu $t5, $t0, $a1 -; LA32-NEXT: add.w $t4, $t5, $t4 -; LA32-NEXT: add.w $t4, $t4, $t6 -; LA32-NEXT: mul.w $t0, $a7, $t0 -; LA32-NEXT: mulh.wu $a7, $a7, $a6 -; LA32-NEXT: add.w $a7, $a7, $t0 -; LA32-NEXT: mul.w $a3, $a3, $a6 -; LA32-NEXT: add.w $a3, $a7, $a3 -; LA32-NEXT: mulh.wu $a7, $a5, $a0 -; LA32-NEXT: mul.w $a4, $a5, $a4 -; LA32-NEXT: add.w $a4, $a7, $a4 -; LA32-NEXT: mul.w $a0, $a1, $a0 -; LA32-NEXT: add.w $a0, $a4, $a0 -; LA32-NEXT: add.w $a0, $a3, $a0 -; LA32-NEXT: sltu $a1, $fp, $t8 -; LA32-NEXT: add.w $a0, $a0, $a1 -; LA32-NEXT: add.w $a0, $t4, $a0 -; LA32-NEXT: add.w $a1, $a0, $t7 -; LA32-NEXT: .LBB1_17: # %overflow.res -; LA32-NEXT: mul.w $a0, $a6, $a5 -; LA32-NEXT: b .LBB1_53 -; LA32-NEXT: .LBB1_18: -; LA32-NEXT: sub.w $t2, $zero, $a0 -; LA32-NEXT: or $t1, $a6, $t0 -; LA32-NEXT: sltu $t3, $zero, $t1 -; LA32-NEXT: sub.w $t1, $t2, $t3 -; LA32-NEXT: sltu $t2, $t2, $t3 -; LA32-NEXT: sltu $t3, $zero, $a0 -; LA32-NEXT: add.w $t3, $a4, $t3 -; LA32-NEXT: add.w $t2, $t3, $t2 -; LA32-NEXT: sub.w $t3, $zero, $t2 -; LA32-NEXT: sub.w $t2, $zero, $a6 -; LA32-NEXT: 
sltu $t4, $zero, $a6 -; LA32-NEXT: add.w $t4, $t0, $t4 -; LA32-NEXT: sub.w $t4, $zero, $t4 -; LA32-NEXT: bltz $a4, .LBB1_20 -; LA32-NEXT: .LBB1_19: # %overflow.no.lhs.only -; LA32-NEXT: move $t3, $a4 -; LA32-NEXT: move $t1, $a0 -; LA32-NEXT: .LBB1_20: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a4, .LBB1_24 -; LA32-NEXT: # %bb.21: # %overflow.no.lhs.only -; LA32-NEXT: move $t4, $t0 -; LA32-NEXT: bgez $a4, .LBB1_25 -; LA32-NEXT: .LBB1_22: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a3, .LBB1_26 -; LA32-NEXT: .LBB1_23: # %overflow.no.lhs.only -; LA32-NEXT: move $a0, $a7 -; LA32-NEXT: move $a6, $a3 -; LA32-NEXT: move $t0, $a5 -; LA32-NEXT: move $t5, $a1 -; LA32-NEXT: bgez $a3, .LBB1_27 -; LA32-NEXT: b .LBB1_28 -; LA32-NEXT: .LBB1_24: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a4, .LBB1_22 -; LA32-NEXT: .LBB1_25: # %overflow.no.lhs.only -; LA32-NEXT: move $t2, $a6 -; LA32-NEXT: bgez $a3, .LBB1_23 -; LA32-NEXT: .LBB1_26: -; LA32-NEXT: sub.w $a6, $zero, $a7 -; LA32-NEXT: or $a0, $a5, $a1 -; LA32-NEXT: sltu $t0, $zero, $a0 -; LA32-NEXT: sub.w $a0, $a6, $t0 -; LA32-NEXT: sltu $a6, $a6, $t0 -; LA32-NEXT: sltu $t0, $zero, $a7 -; LA32-NEXT: add.w $t0, $a3, $t0 -; LA32-NEXT: add.w $a6, $t0, $a6 -; LA32-NEXT: sub.w $a6, $zero, $a6 -; LA32-NEXT: sub.w $t0, $zero, $a5 -; LA32-NEXT: sltu $t5, $zero, $a5 -; LA32-NEXT: add.w $t5, $a1, $t5 -; LA32-NEXT: sub.w $t5, $zero, $t5 -; LA32-NEXT: bltz $a3, .LBB1_28 -; LA32-NEXT: .LBB1_27: # %overflow.no.lhs.only -; LA32-NEXT: move $a6, $a3 -; LA32-NEXT: move $a0, $a7 -; LA32-NEXT: .LBB1_28: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a3, .LBB1_30 -; LA32-NEXT: # %bb.29: # %overflow.no.lhs.only -; LA32-NEXT: move $t5, $a1 -; LA32-NEXT: bgez $a3, .LBB1_31 -; LA32-NEXT: b .LBB1_32 -; LA32-NEXT: .LBB1_30: # %overflow.no.lhs.only -; LA32-NEXT: bltz $a3, .LBB1_32 -; LA32-NEXT: .LBB1_31: # %overflow.no.lhs.only -; LA32-NEXT: move $t0, $a5 -; LA32-NEXT: .LBB1_32: # %overflow.no.lhs.only -; LA32-NEXT: slti $a1, $a4, 0 -; LA32-NEXT: slti $a3, $a3, 0 -; LA32-NEXT: mulh.wu $a4, $t2, $t0 -; LA32-NEXT: mul.w $a5, $t4, $t0 -; LA32-NEXT: add.w $a4, $a5, $a4 -; LA32-NEXT: sltu $a5, $a4, $a5 -; LA32-NEXT: mulh.wu $a7, $t4, $t0 -; LA32-NEXT: add.w $a5, $a7, $a5 -; LA32-NEXT: mul.w $a7, $t2, $t5 -; LA32-NEXT: add.w $a4, $a7, $a4 -; LA32-NEXT: sltu $a7, $a4, $a7 -; LA32-NEXT: mulh.wu $t6, $t2, $t5 -; LA32-NEXT: add.w $a7, $t6, $a7 -; LA32-NEXT: add.w $a7, $a5, $a7 -; LA32-NEXT: mul.w $t6, $t4, $t5 -; LA32-NEXT: add.w $t7, $t6, $a7 -; LA32-NEXT: mul.w $t8, $t0, $t1 -; LA32-NEXT: add.w $t8, $t7, $t8 -; LA32-NEXT: sltu $fp, $t8, $t7 -; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: sltu $a5, $a7, $a5 -; LA32-NEXT: mulh.wu $a7, $t4, $t5 -; LA32-NEXT: add.w $a5, $a7, $a5 -; LA32-NEXT: add.w $a5, $a5, $t6 -; LA32-NEXT: mulh.wu $a7, $t0, $t1 -; LA32-NEXT: mul.w $t6, $t0, $t3 -; LA32-NEXT: add.w $a7, $a7, $t6 -; LA32-NEXT: mul.w $t5, $t5, $t1 -; LA32-NEXT: add.w $a7, $a7, $t5 -; LA32-NEXT: add.w $a5, $a5, $a7 -; LA32-NEXT: add.w $a7, $a5, $fp -; LA32-NEXT: mul.w $a5, $t2, $t0 -; LA32-NEXT: mulh.wu $t0, $t2, $a0 -; LA32-NEXT: mul.w $t5, $t4, $a0 -; LA32-NEXT: add.w $t0, $t5, $t0 -; LA32-NEXT: sltu $t5, $t0, $t5 -; LA32-NEXT: mulh.wu $t6, $t4, $a0 -; LA32-NEXT: add.w $t5, $t6, $t5 -; LA32-NEXT: mul.w $t6, $t2, $a6 -; LA32-NEXT: add.w $t7, $t6, $t0 -; LA32-NEXT: sltu $t0, $t7, $t6 -; LA32-NEXT: mulh.wu $t6, $t2, $a6 -; LA32-NEXT: add.w $t0, $t6, $t0 -; LA32-NEXT: add.w $t6, $t5, $t0 -; LA32-NEXT: mul.w $fp, $t4, $a6 -; LA32-NEXT: add.w $s0, $fp, $t6 -; LA32-NEXT: mul.w $t0, $a0, $t1 -; LA32-NEXT: 
add.w $t0, $s0, $t0 -; LA32-NEXT: sltu $s1, $t0, $s0 -; LA32-NEXT: sltu $fp, $s0, $fp -; LA32-NEXT: sltu $t5, $t6, $t5 -; LA32-NEXT: mulh.wu $t4, $t4, $a6 -; LA32-NEXT: add.w $t4, $t4, $t5 -; LA32-NEXT: add.w $t4, $t4, $fp -; LA32-NEXT: mulh.wu $t5, $a0, $t1 -; LA32-NEXT: mul.w $t3, $a0, $t3 -; LA32-NEXT: add.w $t3, $t5, $t3 -; LA32-NEXT: mul.w $a6, $a6, $t1 -; LA32-NEXT: add.w $a6, $t3, $a6 -; LA32-NEXT: add.w $t3, $t4, $a6 -; LA32-NEXT: mul.w $a0, $t2, $a0 -; LA32-NEXT: add.w $t2, $a7, $t7 -; LA32-NEXT: add.w $a6, $t8, $a0 -; LA32-NEXT: sltu $t1, $a6, $t8 -; LA32-NEXT: add.w $t2, $t2, $t1 -; LA32-NEXT: add.w $a0, $t3, $s1 -; LA32-NEXT: beq $t2, $a7, .LBB1_34 -; LA32-NEXT: # %bb.33: # %overflow.no.lhs.only -; LA32-NEXT: sltu $t1, $t2, $a7 -; LA32-NEXT: .LBB1_34: # %overflow.no.lhs.only -; LA32-NEXT: add.w $a7, $t0, $t1 -; LA32-NEXT: sltu $t0, $a7, $t0 -; LA32-NEXT: add.w $t0, $a0, $t0 -; LA32-NEXT: xor $a1, $a3, $a1 -; LA32-NEXT: sub.w $a3, $zero, $a1 -; LA32-NEXT: xor $a4, $a4, $a3 -; LA32-NEXT: xor $a5, $a5, $a3 -; LA32-NEXT: add.w $a0, $a5, $a1 -; LA32-NEXT: sltu $a5, $a0, $a5 -; LA32-NEXT: add.w $t1, $a4, $a5 -; LA32-NEXT: sltui $a4, $t1, 1 -; LA32-NEXT: sltu $a1, $a0, $a1 -; LA32-NEXT: and $a4, $a4, $a1 -; LA32-NEXT: xor $a1, $t2, $a3 -; LA32-NEXT: xor $a5, $a6, $a3 -; LA32-NEXT: add.w $t2, $a5, $a4 -; LA32-NEXT: sltu $a5, $t2, $a5 -; LA32-NEXT: add.w $a1, $a1, $a5 -; LA32-NEXT: sltui $a5, $a1, 1 -; LA32-NEXT: sltu $a4, $t2, $a4 -; LA32-NEXT: and $a4, $a5, $a4 -; LA32-NEXT: xor $a5, $t0, $a3 -; LA32-NEXT: xor $a3, $a7, $a3 -; LA32-NEXT: add.w $a4, $a3, $a4 -; LA32-NEXT: sltu $a3, $a4, $a3 -; LA32-NEXT: add.w $a3, $a5, $a3 -; LA32-NEXT: or $a3, $a4, $a3 -; LA32-NEXT: b .LBB1_52 -; LA32-NEXT: .LBB1_35: -; LA32-NEXT: sub.w $t2, $zero, $a7 -; LA32-NEXT: or $t1, $a5, $a1 -; LA32-NEXT: sltu $t3, $zero, $t1 -; LA32-NEXT: sub.w $t1, $t2, $t3 -; LA32-NEXT: sltu $t2, $t2, $t3 -; LA32-NEXT: sltu $t3, $zero, $a7 -; LA32-NEXT: add.w $t3, $a3, $t3 -; LA32-NEXT: add.w $t2, $t3, $t2 -; LA32-NEXT: sub.w $t3, $zero, $t2 -; LA32-NEXT: sub.w $t2, $zero, $a5 -; LA32-NEXT: sltu $t4, $zero, $a5 -; LA32-NEXT: add.w $t4, $a1, $t4 -; LA32-NEXT: sub.w $t4, $zero, $t4 -; LA32-NEXT: bltz $a3, .LBB1_37 -; LA32-NEXT: .LBB1_36: # %overflow.no.rhs.only -; LA32-NEXT: move $t3, $a3 -; LA32-NEXT: move $t1, $a7 -; LA32-NEXT: .LBB1_37: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a3, .LBB1_41 -; LA32-NEXT: # %bb.38: # %overflow.no.rhs.only -; LA32-NEXT: move $t4, $a1 -; LA32-NEXT: bgez $a3, .LBB1_42 -; LA32-NEXT: .LBB1_39: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a4, .LBB1_43 -; LA32-NEXT: .LBB1_40: # %overflow.no.rhs.only -; LA32-NEXT: move $a1, $a0 -; LA32-NEXT: move $a5, $a4 -; LA32-NEXT: move $a7, $a6 -; LA32-NEXT: move $t5, $t0 -; LA32-NEXT: bgez $a4, .LBB1_44 -; LA32-NEXT: b .LBB1_45 -; LA32-NEXT: .LBB1_41: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a3, .LBB1_39 -; LA32-NEXT: .LBB1_42: # %overflow.no.rhs.only -; LA32-NEXT: move $t2, $a5 -; LA32-NEXT: bgez $a4, .LBB1_40 -; LA32-NEXT: .LBB1_43: -; LA32-NEXT: sub.w $a5, $zero, $a0 -; LA32-NEXT: or $a1, $a6, $t0 -; LA32-NEXT: sltu $a7, $zero, $a1 -; LA32-NEXT: sub.w $a1, $a5, $a7 -; LA32-NEXT: sltu $a5, $a5, $a7 -; LA32-NEXT: sltu $a7, $zero, $a0 -; LA32-NEXT: add.w $a7, $a4, $a7 -; LA32-NEXT: add.w $a5, $a7, $a5 -; LA32-NEXT: sub.w $a5, $zero, $a5 -; LA32-NEXT: sub.w $a7, $zero, $a6 -; LA32-NEXT: sltu $t5, $zero, $a6 -; LA32-NEXT: add.w $t5, $t0, $t5 -; LA32-NEXT: sub.w $t5, $zero, $t5 -; LA32-NEXT: bltz $a4, .LBB1_45 -; LA32-NEXT: .LBB1_44: # %overflow.no.rhs.only 
-; LA32-NEXT: move $a5, $a4 -; LA32-NEXT: move $a1, $a0 -; LA32-NEXT: .LBB1_45: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a4, .LBB1_47 -; LA32-NEXT: # %bb.46: # %overflow.no.rhs.only -; LA32-NEXT: move $t5, $t0 -; LA32-NEXT: bgez $a4, .LBB1_48 -; LA32-NEXT: b .LBB1_49 -; LA32-NEXT: .LBB1_47: # %overflow.no.rhs.only -; LA32-NEXT: bltz $a4, .LBB1_49 -; LA32-NEXT: .LBB1_48: # %overflow.no.rhs.only -; LA32-NEXT: move $a7, $a6 -; LA32-NEXT: .LBB1_49: # %overflow.no.rhs.only -; LA32-NEXT: slti $a0, $a3, 0 -; LA32-NEXT: slti $a3, $a4, 0 -; LA32-NEXT: mulh.wu $a4, $t2, $a7 -; LA32-NEXT: mul.w $a6, $t4, $a7 -; LA32-NEXT: add.w $a4, $a6, $a4 -; LA32-NEXT: sltu $a6, $a4, $a6 -; LA32-NEXT: mulh.wu $t0, $t4, $a7 -; LA32-NEXT: add.w $a6, $t0, $a6 -; LA32-NEXT: mul.w $t0, $t2, $t5 -; LA32-NEXT: add.w $a4, $t0, $a4 -; LA32-NEXT: sltu $t0, $a4, $t0 -; LA32-NEXT: mulh.wu $t6, $t2, $t5 -; LA32-NEXT: add.w $t0, $t6, $t0 -; LA32-NEXT: add.w $t0, $a6, $t0 -; LA32-NEXT: mul.w $t6, $t4, $t5 -; LA32-NEXT: add.w $t7, $t6, $t0 -; LA32-NEXT: mul.w $t8, $a7, $t1 -; LA32-NEXT: add.w $t8, $t7, $t8 -; LA32-NEXT: sltu $fp, $t8, $t7 -; LA32-NEXT: sltu $t6, $t7, $t6 -; LA32-NEXT: sltu $a6, $t0, $a6 -; LA32-NEXT: mulh.wu $t0, $t4, $t5 -; LA32-NEXT: add.w $a6, $t0, $a6 -; LA32-NEXT: add.w $a6, $a6, $t6 -; LA32-NEXT: mulh.wu $t0, $a7, $t1 -; LA32-NEXT: mul.w $t6, $a7, $t3 -; LA32-NEXT: add.w $t0, $t0, $t6 -; LA32-NEXT: mul.w $t5, $t5, $t1 -; LA32-NEXT: add.w $t0, $t0, $t5 -; LA32-NEXT: add.w $a6, $a6, $t0 -; LA32-NEXT: add.w $t0, $a6, $fp -; LA32-NEXT: mul.w $a6, $t2, $a7 -; LA32-NEXT: mulh.wu $a7, $t2, $a1 -; LA32-NEXT: mul.w $t5, $t4, $a1 -; LA32-NEXT: add.w $a7, $t5, $a7 -; LA32-NEXT: sltu $t5, $a7, $t5 -; LA32-NEXT: mulh.wu $t6, $t4, $a1 -; LA32-NEXT: add.w $t5, $t6, $t5 -; LA32-NEXT: mul.w $t6, $t2, $a5 -; LA32-NEXT: add.w $t7, $t6, $a7 -; LA32-NEXT: sltu $a7, $t7, $t6 -; LA32-NEXT: mulh.wu $t6, $t2, $a5 -; LA32-NEXT: add.w $a7, $t6, $a7 -; LA32-NEXT: add.w $t6, $t5, $a7 -; LA32-NEXT: mul.w $fp, $t4, $a5 -; LA32-NEXT: add.w $s0, $fp, $t6 -; LA32-NEXT: mul.w $a7, $a1, $t1 -; LA32-NEXT: add.w $a7, $s0, $a7 -; LA32-NEXT: sltu $s1, $a7, $s0 -; LA32-NEXT: sltu $fp, $s0, $fp -; LA32-NEXT: sltu $t5, $t6, $t5 -; LA32-NEXT: mulh.wu $t4, $t4, $a5 -; LA32-NEXT: add.w $t4, $t4, $t5 -; LA32-NEXT: add.w $t4, $t4, $fp -; LA32-NEXT: mulh.wu $t5, $a1, $t1 -; LA32-NEXT: mul.w $t3, $a1, $t3 -; LA32-NEXT: add.w $t3, $t5, $t3 -; LA32-NEXT: mul.w $a5, $a5, $t1 -; LA32-NEXT: add.w $a5, $t3, $a5 -; LA32-NEXT: add.w $t1, $t4, $a5 -; LA32-NEXT: mul.w $a1, $t2, $a1 -; LA32-NEXT: add.w $a5, $t0, $t7 -; LA32-NEXT: add.w $a1, $t8, $a1 -; LA32-NEXT: sltu $t2, $a1, $t8 -; LA32-NEXT: add.w $a5, $a5, $t2 -; LA32-NEXT: add.w $t1, $t1, $s1 -; LA32-NEXT: beq $a5, $t0, .LBB1_51 -; LA32-NEXT: # %bb.50: # %overflow.no.rhs.only -; LA32-NEXT: sltu $t2, $a5, $t0 -; LA32-NEXT: .LBB1_51: # %overflow.no.rhs.only -; LA32-NEXT: add.w $t0, $a7, $t2 -; LA32-NEXT: sltu $a7, $t0, $a7 -; LA32-NEXT: add.w $a7, $t1, $a7 -; LA32-NEXT: xor $a3, $a0, $a3 -; LA32-NEXT: sub.w $t3, $zero, $a3 -; LA32-NEXT: xor $a4, $a4, $t3 -; LA32-NEXT: xor $a6, $a6, $t3 -; LA32-NEXT: add.w $a0, $a6, $a3 -; LA32-NEXT: sltu $a6, $a0, $a6 -; LA32-NEXT: add.w $t1, $a4, $a6 -; LA32-NEXT: sltui $a4, $t1, 1 -; LA32-NEXT: sltu $a3, $a0, $a3 -; LA32-NEXT: and $a3, $a4, $a3 -; LA32-NEXT: xor $a4, $a5, $t3 -; LA32-NEXT: xor $a1, $a1, $t3 -; LA32-NEXT: add.w $t2, $a1, $a3 -; LA32-NEXT: sltu $a1, $t2, $a1 -; LA32-NEXT: add.w $a1, $a4, $a1 -; LA32-NEXT: sltui $a4, $a1, 1 -; LA32-NEXT: sltu $a3, $t2, $a3 -; 
LA32-NEXT: and $a3, $a4, $a3 -; LA32-NEXT: xor $a4, $a7, $t3 -; LA32-NEXT: xor $a5, $t0, $t3 -; LA32-NEXT: add.w $a3, $a5, $a3 -; LA32-NEXT: sltu $a5, $a3, $a5 -; LA32-NEXT: add.w $a4, $a4, $a5 -; LA32-NEXT: or $a3, $a3, $a4 -; LA32-NEXT: .LBB1_52: # %overflow.res -; LA32-NEXT: sltu $t3, $zero, $a3 -; LA32-NEXT: .LBB1_53: # %overflow.res -; LA32-NEXT: st.w $a0, $a2, 0 -; LA32-NEXT: st.w $t1, $a2, 4 -; LA32-NEXT: st.w $t2, $a2, 8 -; LA32-NEXT: andi $a0, $t3, 1 -; LA32-NEXT: st.w $a1, $a2, 12 +; LA32-NEXT: srai.w $a5, $t2, 31 +; LA32-NEXT: xor $a0, $a0, $a5 +; LA32-NEXT: xor $a6, $t6, $a5 +; LA32-NEXT: or $a0, $a6, $a0 +; LA32-NEXT: xor $a6, $t7, $a5 +; LA32-NEXT: xor $a5, $t5, $a5 +; LA32-NEXT: or $a5, $a5, $a6 +; LA32-NEXT: or $a0, $a5, $a0 +; LA32-NEXT: sltu $a0, $zero, $a0 +; LA32-NEXT: mul.w $a3, $a4, $a3 +; LA32-NEXT: st.w $a3, $a2, 0 +; LA32-NEXT: st.w $a1, $a2, 4 +; LA32-NEXT: st.w $t1, $a2, 8 +; LA32-NEXT: st.w $t2, $a2, 12 ; LA32-NEXT: ld.w $s8, $sp, 4 # 4-byte Folded Reload ; LA32-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload @@ -838,13 +295,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA32-NEXT: ret ; ; LA64-LABEL: smuloi128: -; LA64: # %bb.0: # %overflow.entry -; LA64-NEXT: srai.d $a6, $a0, 63 -; LA64-NEXT: srai.d $a5, $a2, 63 -; LA64-NEXT: beq $a1, $a6, .LBB1_3 -; LA64-NEXT: # %bb.1: # %overflow.lhs -; LA64-NEXT: beq $a3, $a5, .LBB1_5 -; LA64-NEXT: # %bb.2: # %overflow +; LA64: # %bb.0: ; LA64-NEXT: mulh.du $a5, $a0, $a2 ; LA64-NEXT: mul.d $a6, $a1, $a2 ; LA64-NEXT: add.d $a5, $a6, $a5 @@ -878,129 +329,11 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; LA64-NEXT: xor $a1, $a1, $a6 ; LA64-NEXT: xor $a3, $a3, $a6 ; LA64-NEXT: or $a1, $a3, $a1 -; LA64-NEXT: sltu $a6, $zero, $a1 -; LA64-NEXT: b .LBB1_8 -; LA64-NEXT: .LBB1_3: # %overflow.no.lhs -; LA64-NEXT: beq $a3, $a5, .LBB1_7 -; LA64-NEXT: # %bb.4: # %overflow.no.lhs.only -; LA64-NEXT: slti $a5, $a1, 0 -; LA64-NEXT: masknez $a6, $a0, $a5 -; LA64-NEXT: sub.d $a7, $zero, $a0 -; LA64-NEXT: maskeqz $a7, $a7, $a5 -; LA64-NEXT: or $a7, $a7, $a6 -; LA64-NEXT: masknez $t0, $a1, $a5 -; LA64-NEXT: sltu $a0, $zero, $a0 -; LA64-NEXT: add.d $a0, $a1, $a0 -; LA64-NEXT: sub.d $a0, $zero, $a0 -; LA64-NEXT: maskeqz $a0, $a0, $a5 -; LA64-NEXT: or $a0, $a0, $t0 -; LA64-NEXT: maskeqz $a0, $a0, $a5 -; LA64-NEXT: or $a0, $a0, $t0 -; LA64-NEXT: maskeqz $a1, $a7, $a5 -; LA64-NEXT: or $a1, $a1, $a6 -; LA64-NEXT: slti $a6, $a3, 0 -; LA64-NEXT: masknez $a7, $a2, $a6 -; LA64-NEXT: sub.d $t0, $zero, $a2 -; LA64-NEXT: maskeqz $t0, $t0, $a6 -; LA64-NEXT: or $t0, $t0, $a7 -; LA64-NEXT: masknez $t1, $a3, $a6 -; LA64-NEXT: sltu $a2, $zero, $a2 -; LA64-NEXT: add.d $a2, $a3, $a2 -; LA64-NEXT: sub.d $a2, $zero, $a2 -; LA64-NEXT: maskeqz $a2, $a2, $a6 -; LA64-NEXT: or $a2, $a2, $t1 -; LA64-NEXT: maskeqz $a2, $a2, $a6 -; LA64-NEXT: or $a2, $a2, $t1 -; LA64-NEXT: maskeqz $a3, $t0, $a6 -; LA64-NEXT: or $a3, $a3, $a7 -; LA64-NEXT: mulh.du $a7, $a1, $a3 -; LA64-NEXT: mul.d $t0, $a0, $a3 -; LA64-NEXT: add.d $a7, $a7, $t0 -; LA64-NEXT: mul.d $a3, $a1, $a3 -; LA64-NEXT: mul.d $a0, $a0, $a2 -; LA64-NEXT: mulh.du $t0, $a1, $a2 -; LA64-NEXT: add.d $a0, $t0, $a0 -; LA64-NEXT: mul.d $a1, $a1, $a2 -; LA64-NEXT: add.d $a1, $a7, $a1 -; LA64-NEXT: sltu $a2, $a1, $a7 -; LA64-NEXT: add.d $a2, $a0, $a2 -; LA64-NEXT: xor $a5, $a6, $a5 -; LA64-NEXT: sub.d $a6, $zero, $a5 -; LA64-NEXT: xor $a0, $a3, $a6 -; LA64-NEXT: add.d $a0, $a0, $a5 -; LA64-NEXT: sltu $a3, $a0, $a5 -; LA64-NEXT: xor $a1, 
$a1, $a6 -; LA64-NEXT: add.d $a5, $a1, $a3 -; LA64-NEXT: sltu $a1, $a5, $a3 -; LA64-NEXT: b .LBB1_6 -; LA64-NEXT: .LBB1_5: # %overflow.no.rhs.only -; LA64-NEXT: slti $a5, $a3, 0 -; LA64-NEXT: masknez $a6, $a2, $a5 -; LA64-NEXT: sub.d $a7, $zero, $a2 -; LA64-NEXT: maskeqz $a7, $a7, $a5 -; LA64-NEXT: or $a7, $a7, $a6 -; LA64-NEXT: masknez $t0, $a3, $a5 -; LA64-NEXT: sltu $a2, $zero, $a2 -; LA64-NEXT: add.d $a2, $a3, $a2 -; LA64-NEXT: sub.d $a2, $zero, $a2 -; LA64-NEXT: maskeqz $a2, $a2, $a5 -; LA64-NEXT: or $a2, $a2, $t0 -; LA64-NEXT: maskeqz $a2, $a2, $a5 -; LA64-NEXT: or $a2, $a2, $t0 -; LA64-NEXT: maskeqz $a3, $a7, $a5 -; LA64-NEXT: or $a3, $a3, $a6 -; LA64-NEXT: slti $a6, $a1, 0 -; LA64-NEXT: masknez $a7, $a0, $a6 -; LA64-NEXT: sub.d $t0, $zero, $a0 -; LA64-NEXT: maskeqz $t0, $t0, $a6 -; LA64-NEXT: or $t0, $t0, $a7 -; LA64-NEXT: masknez $t1, $a1, $a6 -; LA64-NEXT: sltu $a0, $zero, $a0 -; LA64-NEXT: add.d $a0, $a1, $a0 -; LA64-NEXT: sub.d $a0, $zero, $a0 -; LA64-NEXT: maskeqz $a0, $a0, $a6 -; LA64-NEXT: or $a0, $a0, $t1 -; LA64-NEXT: maskeqz $a0, $a0, $a6 -; LA64-NEXT: or $a0, $a0, $t1 -; LA64-NEXT: maskeqz $a1, $t0, $a6 -; LA64-NEXT: or $a1, $a1, $a7 -; LA64-NEXT: mulh.du $a7, $a3, $a1 -; LA64-NEXT: mul.d $t0, $a2, $a1 -; LA64-NEXT: add.d $a7, $a7, $t0 -; LA64-NEXT: mul.d $a1, $a3, $a1 -; LA64-NEXT: mul.d $a2, $a2, $a0 -; LA64-NEXT: mulh.du $t0, $a3, $a0 -; LA64-NEXT: add.d $a2, $t0, $a2 -; LA64-NEXT: mul.d $a0, $a3, $a0 -; LA64-NEXT: add.d $a3, $a7, $a0 -; LA64-NEXT: sltu $a0, $a3, $a7 -; LA64-NEXT: add.d $a2, $a2, $a0 -; LA64-NEXT: xor $a5, $a5, $a6 -; LA64-NEXT: sub.d $a6, $zero, $a5 -; LA64-NEXT: xor $a0, $a1, $a6 -; LA64-NEXT: add.d $a0, $a0, $a5 -; LA64-NEXT: sltu $a1, $a0, $a5 -; LA64-NEXT: xor $a3, $a3, $a6 -; LA64-NEXT: add.d $a5, $a3, $a1 -; LA64-NEXT: sltu $a1, $a5, $a1 -; LA64-NEXT: .LBB1_6: # %overflow.res -; LA64-NEXT: xor $a2, $a2, $a6 -; LA64-NEXT: add.d $a1, $a2, $a1 -; LA64-NEXT: sltu $a6, $zero, $a1 -; LA64-NEXT: b .LBB1_9 -; LA64-NEXT: .LBB1_7: # %overflow.no -; LA64-NEXT: move $a6, $zero -; LA64-NEXT: mulh.du $a5, $a0, $a2 -; LA64-NEXT: mul.d $a3, $a0, $a3 -; LA64-NEXT: add.d $a3, $a5, $a3 -; LA64-NEXT: mul.d $a1, $a1, $a2 -; LA64-NEXT: add.d $a5, $a3, $a1 -; LA64-NEXT: .LBB1_8: # %overflow.res +; LA64-NEXT: sltu $a1, $zero, $a1 ; LA64-NEXT: mul.d $a0, $a0, $a2 -; LA64-NEXT: .LBB1_9: # %overflow.res ; LA64-NEXT: st.d $a0, $a4, 0 -; LA64-NEXT: andi $a0, $a6, 1 ; LA64-NEXT: st.d $a5, $a4, 8 +; LA64-NEXT: move $a0, $a1 ; LA64-NEXT: ret %t = call {i128, i1} @llvm.smul.with.overflow.i128(i128 %v1, i128 %v2) %val = extractvalue {i128, i1} %t, 0 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index 5bebf54c3c1a0..f573fdab1b153 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -4,343 +4,136 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: -; PPC64: # %bb.0: # %overflow.entry -; PPC64-NEXT: cmpldi 3, 0 -; PPC64-NEXT: beq 0, .LBB0_3 -; PPC64-NEXT: # %bb.1: # %overflow.lhs -; PPC64-NEXT: cmpldi 5, 0 -; PPC64-NEXT: beq 0, .LBB0_5 -; PPC64-NEXT: # %bb.2: # %overflow -; PPC64-NEXT: mulhdu. 7, 3, 6 -; PPC64-NEXT: mcrf 5, 0 -; PPC64-NEXT: cmpdi 6, 5, 0 -; PPC64-NEXT: mulhdu. 
7, 5, 4 -; PPC64-NEXT: mcrf 1, 0 -; PPC64-NEXT: cmpdi 3, 0 -; PPC64-NEXT: mulld 5, 5, 4 -; PPC64-NEXT: mulld 3, 3, 6 -; PPC64-NEXT: crnor 20, 26, 2 -; PPC64-NEXT: add 3, 3, 5 -; PPC64-NEXT: crorc 20, 20, 22 -; PPC64-NEXT: mulhdu 7, 4, 6 -; PPC64-NEXT: addc 3, 7, 3 -; PPC64-NEXT: li 5, 0 -; PPC64-NEXT: addze. 5, 5 -; PPC64-NEXT: crorc 20, 20, 6 -; PPC64-NEXT: crorc 20, 20, 2 +; PPC64: # %bb.0: # %start +; PPC64-NEXT: addic 9, 5, -1 +; PPC64-NEXT: mulld 10, 5, 4 +; PPC64-NEXT: mulld 11, 3, 6 +; PPC64-NEXT: subfe 9, 9, 5 +; PPC64-NEXT: add 10, 11, 10 +; PPC64-NEXT: addic 11, 3, -1 +; PPC64-NEXT: mulhdu 8, 3, 6 +; PPC64-NEXT: subfe 3, 11, 3 +; PPC64-NEXT: and 3, 3, 9 +; PPC64-NEXT: addic 9, 8, -1 +; PPC64-NEXT: subfe 8, 9, 8 +; PPC64-NEXT: or 3, 3, 8 +; PPC64-NEXT: mulhdu 5, 5, 4 +; PPC64-NEXT: addic 8, 5, -1 +; PPC64-NEXT: subfe 5, 8, 5 +; PPC64-NEXT: li 7, 0 +; PPC64-NEXT: or 5, 3, 5 +; PPC64-NEXT: mulhdu 8, 4, 6 +; PPC64-NEXT: addc 3, 8, 10 +; PPC64-NEXT: addze 7, 7 +; PPC64-NEXT: addic 8, 7, -1 +; PPC64-NEXT: subfe 7, 8, 7 +; PPC64-NEXT: or 5, 5, 7 ; PPC64-NEXT: mulld 4, 4, 6 -; PPC64-NEXT: b .LBB0_7 -; PPC64-NEXT: .LBB0_3: # %overflow.no.lhs -; PPC64-NEXT: cmpldi 5, 0 -; PPC64-NEXT: beq 0, .LBB0_6 -; PPC64-NEXT: # %bb.4: # %overflow.no.lhs.only -; PPC64-NEXT: mulhdu 7, 4, 6 -; PPC64-NEXT: mulld 8, 3, 6 -; PPC64-NEXT: mulld 9, 3, 5 -; PPC64-NEXT: add 3, 7, 8 -; PPC64-NEXT: mulhdu 7, 4, 5 -; PPC64-NEXT: mulld 5, 4, 5 -; PPC64-NEXT: mulld 4, 4, 6 -; PPC64-NEXT: addc 3, 3, 5 -; PPC64-NEXT: adde. 5, 7, 9 -; PPC64-NEXT: crnot 20, 2 -; PPC64-NEXT: b .LBB0_7 -; PPC64-NEXT: .LBB0_5: # %overflow.no.rhs.only -; PPC64-NEXT: mulhdu 7, 6, 4 -; PPC64-NEXT: mulld 8, 5, 4 -; PPC64-NEXT: mulld 5, 5, 3 -; PPC64-NEXT: mulld 4, 6, 4 -; PPC64-NEXT: add 7, 7, 8 -; PPC64-NEXT: mulhdu 8, 6, 3 -; PPC64-NEXT: mulld 3, 6, 3 -; PPC64-NEXT: addc 3, 7, 3 -; PPC64-NEXT: adde. 5, 8, 5 -; PPC64-NEXT: crnot 20, 2 -; PPC64-NEXT: b .LBB0_7 -; PPC64-NEXT: .LBB0_6: # %overflow.no -; PPC64-NEXT: mulld 5, 4, 5 -; PPC64-NEXT: mulhdu 7, 4, 6 -; PPC64-NEXT: mulld 3, 3, 6 -; PPC64-NEXT: add 5, 7, 5 -; PPC64-NEXT: mulld 4, 4, 6 -; PPC64-NEXT: add 3, 5, 3 -; PPC64-NEXT: crxor 20, 20, 20 -; PPC64-NEXT: .LBB0_7: # %overflow.res -; PPC64-NEXT: li 5, 1 -; PPC64-NEXT: bclr 12, 20, 0 -; PPC64-NEXT: # %bb.8: # %overflow.res -; PPC64-NEXT: li 5, 0 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: -; PPC32: # %bb.0: # %overflow.entry -; PPC32-NEXT: stwu 1, -80(1) -; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill +; PPC32: # %bb.0: # %start +; PPC32-NEXT: stwu 1, -64(1) +; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill ; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: or. 30, 4, 3 -; PPC32-NEXT: stw 18, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 12, 20(1) -; PPC32-NEXT: beq 0, .LBB0_3 -; PPC32-NEXT: # %bb.1: # %overflow.lhs -; PPC32-NEXT: or. 
29, 8, 7 -; PPC32-NEXT: beq 0, .LBB0_5 -; PPC32-NEXT: # %bb.2: # %overflow -; PPC32-NEXT: mullw 28, 9, 4 -; PPC32-NEXT: li 19, 0 -; PPC32-NEXT: cmpwi 2, 7, 0 -; PPC32-NEXT: cmpwi 3, 5, 0 -; PPC32-NEXT: cmpwi 7, 3, 0 -; PPC32-NEXT: mullw 27, 3, 10 -; PPC32-NEXT: add 28, 27, 28 -; PPC32-NEXT: mulhwu 11, 4, 10 -; PPC32-NEXT: addc 11, 11, 28 -; PPC32-NEXT: addze 28, 19 -; PPC32-NEXT: mullw 24, 5, 8 -; PPC32-NEXT: mullw 23, 7, 6 -; PPC32-NEXT: add 27, 23, 24 -; PPC32-NEXT: mulhwu 12, 8, 6 -; PPC32-NEXT: addc 12, 12, 27 -; PPC32-NEXT: addze 27, 19 -; PPC32-NEXT: mullw 22, 8, 6 -; PPC32-NEXT: mullw 21, 4, 10 -; PPC32-NEXT: addc 23, 21, 22 -; PPC32-NEXT: adde 21, 11, 12 -; PPC32-NEXT: mulhwu 26, 6, 10 -; PPC32-NEXT: mullw 20, 5, 10 -; PPC32-NEXT: addc 11, 20, 26 -; PPC32-NEXT: mulhwu 0, 5, 10 -; PPC32-NEXT: addze 12, 0 -; PPC32-NEXT: mullw 22, 6, 9 -; PPC32-NEXT: addc 11, 22, 11 -; PPC32-NEXT: mulhwu 25, 6, 9 -; PPC32-NEXT: addze 26, 25 -; PPC32-NEXT: addc 12, 12, 26 -; PPC32-NEXT: addze 26, 19 -; PPC32-NEXT: mullw 0, 5, 9 -; PPC32-NEXT: addc 12, 0, 12 -; PPC32-NEXT: mulhwu 24, 5, 9 -; PPC32-NEXT: adde 0, 24, 26 -; PPC32-NEXT: addc 12, 12, 23 -; PPC32-NEXT: adde 0, 0, 21 -; PPC32-NEXT: addze. 26, 19 +; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 27, 9, 4 +; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: mr 11, 7 +; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill +; PPC32-NEXT: li 7, 0 +; PPC32-NEXT: mullw 26, 3, 10 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: add 27, 26, 27 +; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 7, 11, 0 +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 24, 11, 6 +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 0, 8, 6 +; PPC32-NEXT: stw 12, 16(1) +; PPC32-NEXT: mr 12, 5 +; PPC32-NEXT: mulhwu 5, 4, 10 +; PPC32-NEXT: addc 5, 5, 27 +; PPC32-NEXT: addze 27, 7 +; PPC32-NEXT: cmpwi 2, 27, 0 +; PPC32-NEXT: mullw 25, 12, 8 +; PPC32-NEXT: add 26, 24, 25 +; PPC32-NEXT: addc 0, 0, 26 +; PPC32-NEXT: addze 26, 7 +; PPC32-NEXT: mullw 23, 8, 6 +; PPC32-NEXT: mullw 22, 4, 10 +; PPC32-NEXT: addc 24, 22, 23 +; PPC32-NEXT: adde 22, 5, 0 +; PPC32-NEXT: mulhwu 29, 6, 10 +; PPC32-NEXT: mullw 21, 12, 10 +; PPC32-NEXT: addc 5, 21, 29 +; PPC32-NEXT: mulhwu 30, 12, 10 +; PPC32-NEXT: addze 0, 30 +; PPC32-NEXT: mullw 23, 6, 9 +; PPC32-NEXT: addc 5, 23, 5 +; PPC32-NEXT: mulhwu 28, 6, 9 +; PPC32-NEXT: addze 29, 28 +; PPC32-NEXT: addc 0, 0, 29 +; PPC32-NEXT: addze 29, 7 +; PPC32-NEXT: mullw 30, 12, 9 +; PPC32-NEXT: addc 0, 30, 0 +; PPC32-NEXT: mulhwu 25, 12, 9 +; PPC32-NEXT: adde 30, 25, 29 +; PPC32-NEXT: addc 0, 0, 24 +; PPC32-NEXT: adde 30, 30, 22 +; PPC32-NEXT: addze. 29, 7 ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: mulhwu. 26, 7, 6 -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: crnor 20, 14, 10 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: cmpwi 2, 30, 0 -; PPC32-NEXT: cmpwi 3, 29, 0 -; PPC32-NEXT: mulhwu. 5, 5, 8 +; PPC32-NEXT: mulhwu. 29, 11, 6 ; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: cmpwi 9, 0 -; PPC32-NEXT: crnor 21, 2, 30 +; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: cmpwi 12, 0 +; PPC32-NEXT: crnor 20, 2, 30 +; PPC32-NEXT: cmpwi 3, 0 +; PPC32-NEXT: cmpwi 7, 9, 0 +; PPC32-NEXT: crnor 24, 30, 2 +; PPC32-NEXT: mulhwu. 12, 3, 10 ; PPC32-NEXT: crorc 20, 20, 26 -; PPC32-NEXT: crnor 23, 14, 10 -; PPC32-NEXT: mulhwu. 
3, 3, 10 ; PPC32-NEXT: mcrf 7, 0 -; PPC32-NEXT: cmpwi 27, 0 -; PPC32-NEXT: crorc 20, 20, 2 -; PPC32-NEXT: crorc 21, 21, 30 -; PPC32-NEXT: mulhwu. 3, 9, 4 -; PPC32-NEXT: crorc 21, 21, 2 -; PPC32-NEXT: cmpwi 28, 0 -; PPC32-NEXT: crorc 21, 21, 2 -; PPC32-NEXT: cror 21, 23, 21 -; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: crorc 20, 20, 6 -; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: bc 4, 20, .LBB0_7 -; PPC32-NEXT: b .LBB0_8 -; PPC32-NEXT: .LBB0_3: # %overflow.no.lhs -; PPC32-NEXT: or. 11, 8, 7 -; PPC32-NEXT: beq 0, .LBB0_9 -; PPC32-NEXT: # %bb.4: # %overflow.no.lhs.only -; PPC32-NEXT: mulhwu 29, 10, 4 -; PPC32-NEXT: mullw 20, 10, 3 -; PPC32-NEXT: add 29, 29, 20 -; PPC32-NEXT: mulhwu 12, 6, 10 -; PPC32-NEXT: mulhwu 0, 6, 9 -; PPC32-NEXT: mulhwu 30, 5, 9 -; PPC32-NEXT: mulhwu 24, 8, 4 -; PPC32-NEXT: mullw 23, 5, 10 -; PPC32-NEXT: addc 12, 23, 12 -; PPC32-NEXT: mullw 22, 6, 9 -; PPC32-NEXT: mullw 21, 5, 9 -; PPC32-NEXT: mullw 9, 9, 4 -; PPC32-NEXT: add 9, 29, 9 -; PPC32-NEXT: mullw 3, 8, 3 -; PPC32-NEXT: add 3, 24, 3 -; PPC32-NEXT: mulhwu 11, 5, 10 -; PPC32-NEXT: mullw 29, 7, 4 -; PPC32-NEXT: add 3, 3, 29 -; PPC32-NEXT: addze 29, 11 -; PPC32-NEXT: addc 11, 22, 12 -; PPC32-NEXT: addze 0, 0 -; PPC32-NEXT: li 12, 0 -; PPC32-NEXT: addc 0, 29, 0 -; PPC32-NEXT: addze 29, 12 -; PPC32-NEXT: addc 0, 21, 0 -; PPC32-NEXT: mullw 19, 10, 4 -; PPC32-NEXT: adde 30, 30, 29 -; PPC32-NEXT: addc 0, 0, 19 -; PPC32-NEXT: adde 9, 30, 9 -; PPC32-NEXT: mulhwu 27, 6, 8 -; PPC32-NEXT: mullw 18, 5, 8 -; PPC32-NEXT: addc 30, 18, 27 -; PPC32-NEXT: mulhwu 28, 5, 8 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: mulhwu 26, 6, 7 -; PPC32-NEXT: mulhwu 25, 5, 7 -; PPC32-NEXT: mullw 5, 5, 7 -; PPC32-NEXT: mullw 7, 6, 7 -; PPC32-NEXT: addc 7, 7, 30 -; PPC32-NEXT: addze 30, 26 -; PPC32-NEXT: addc 30, 29, 30 -; PPC32-NEXT: addze 12, 12 -; PPC32-NEXT: addc 5, 5, 30 -; PPC32-NEXT: mullw 4, 8, 4 -; PPC32-NEXT: adde 12, 25, 12 -; PPC32-NEXT: addc 4, 5, 4 -; PPC32-NEXT: adde 3, 12, 3 -; PPC32-NEXT: mullw 5, 6, 8 -; PPC32-NEXT: addc 12, 0, 5 -; PPC32-NEXT: adde 0, 9, 7 -; PPC32-NEXT: addze 4, 4 -; PPC32-NEXT: addze 3, 3 +; PPC32-NEXT: crorc 20, 20, 22 +; PPC32-NEXT: cmpwi 26, 0 +; PPC32-NEXT: crorc 28, 20, 2 +; PPC32-NEXT: mulhwu. 9, 9, 4 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: crorc 20, 24, 30 ; PPC32-NEXT: or. 3, 4, 3 +; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: crorc 20, 20, 22 +; PPC32-NEXT: or. 
3, 8, 11 +; PPC32-NEXT: crorc 20, 20, 10 +; PPC32-NEXT: crnor 21, 2, 26 +; PPC32-NEXT: cror 20, 21, 20 +; PPC32-NEXT: cror 20, 20, 28 +; PPC32-NEXT: crandc 20, 6, 20 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: b .LBB0_6 -; PPC32-NEXT: .LBB0_5: # %overflow.no.rhs.only -; PPC32-NEXT: mulhwu 29, 6, 8 -; PPC32-NEXT: mullw 20, 6, 7 -; PPC32-NEXT: add 29, 29, 20 -; PPC32-NEXT: mulhwu 12, 10, 6 -; PPC32-NEXT: mulhwu 0, 10, 5 -; PPC32-NEXT: mulhwu 30, 9, 5 -; PPC32-NEXT: mulhwu 24, 4, 8 -; PPC32-NEXT: mullw 23, 9, 6 -; PPC32-NEXT: addc 12, 23, 12 -; PPC32-NEXT: mullw 22, 10, 5 -; PPC32-NEXT: mullw 21, 9, 5 -; PPC32-NEXT: mullw 5, 5, 8 -; PPC32-NEXT: add 5, 29, 5 -; PPC32-NEXT: mullw 7, 4, 7 -; PPC32-NEXT: add 7, 24, 7 -; PPC32-NEXT: mulhwu 11, 9, 6 -; PPC32-NEXT: mullw 29, 3, 8 -; PPC32-NEXT: add 7, 7, 29 -; PPC32-NEXT: addze 29, 11 -; PPC32-NEXT: addc 11, 22, 12 -; PPC32-NEXT: addze 0, 0 -; PPC32-NEXT: li 12, 0 -; PPC32-NEXT: addc 0, 29, 0 -; PPC32-NEXT: addze 29, 12 -; PPC32-NEXT: addc 0, 21, 0 -; PPC32-NEXT: mullw 19, 6, 8 -; PPC32-NEXT: adde 30, 30, 29 -; PPC32-NEXT: addc 0, 0, 19 -; PPC32-NEXT: adde 5, 30, 5 -; PPC32-NEXT: mulhwu 27, 10, 4 -; PPC32-NEXT: mullw 18, 9, 4 -; PPC32-NEXT: addc 30, 18, 27 -; PPC32-NEXT: mulhwu 28, 9, 4 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: mulhwu 26, 10, 3 -; PPC32-NEXT: mulhwu 25, 9, 3 -; PPC32-NEXT: mullw 9, 9, 3 -; PPC32-NEXT: mullw 3, 10, 3 -; PPC32-NEXT: addc 3, 3, 30 -; PPC32-NEXT: addze 30, 26 -; PPC32-NEXT: addc 30, 29, 30 -; PPC32-NEXT: addze 12, 12 -; PPC32-NEXT: addc 9, 9, 30 -; PPC32-NEXT: mullw 8, 4, 8 -; PPC32-NEXT: adde 12, 25, 12 -; PPC32-NEXT: addc 8, 9, 8 -; PPC32-NEXT: adde 7, 12, 7 -; PPC32-NEXT: mullw 4, 10, 4 -; PPC32-NEXT: addc 12, 0, 4 -; PPC32-NEXT: adde 0, 5, 3 -; PPC32-NEXT: addze 3, 8 -; PPC32-NEXT: addze 4, 7 -; PPC32-NEXT: or. 
3, 3, 4 -; PPC32-NEXT: mullw 6, 10, 6 -; PPC32-NEXT: .LBB0_6: # %overflow.no.rhs.only -; PPC32-NEXT: crnot 20, 2 +; PPC32-NEXT: bc 12, 20, .LBB0_2 +; PPC32-NEXT: # %bb.1: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: bc 12, 20, .LBB0_8 -; PPC32-NEXT: .LBB0_7: # %overflow.res -; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: .LBB0_8: # %overflow.res -; PPC32-NEXT: mr 4, 12 -; PPC32-NEXT: lwz 12, 20(1) -; PPC32-NEXT: mr 3, 0 -; PPC32-NEXT: mr 5, 11 -; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload +; PPC32-NEXT: .LBB0_2: # %start +; PPC32-NEXT: lwz 12, 16(1) +; PPC32-NEXT: mr 3, 30 +; PPC32-NEXT: mr 4, 0 +; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: mtcrf 16, 12 # cr3 -; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 18, 24(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 80 +; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr -; PPC32-NEXT: .LBB0_9: # %overflow.no -; PPC32-NEXT: mulhwu 11, 10, 4 -; PPC32-NEXT: mulhwu 12, 8, 6 -; PPC32-NEXT: mullw 3, 10, 3 -; PPC32-NEXT: add 3, 11, 3 -; PPC32-NEXT: mullw 26, 8, 5 -; PPC32-NEXT: mulhwu 0, 5, 10 -; PPC32-NEXT: mulhwu 30, 6, 10 -; PPC32-NEXT: mulhwu 29, 6, 9 -; PPC32-NEXT: mulhwu 28, 5, 9 -; PPC32-NEXT: mullw 27, 9, 4 -; PPC32-NEXT: add 3, 3, 27 -; PPC32-NEXT: mullw 7, 7, 6 -; PPC32-NEXT: mullw 4, 10, 4 -; PPC32-NEXT: mullw 8, 8, 6 -; PPC32-NEXT: addc 4, 8, 4 -; PPC32-NEXT: li 8, 0 -; PPC32-NEXT: mullw 25, 5, 10 -; PPC32-NEXT: mullw 5, 5, 9 -; PPC32-NEXT: mullw 9, 6, 9 -; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: add 10, 12, 26 -; PPC32-NEXT: add 7, 10, 7 -; PPC32-NEXT: adde 3, 7, 3 -; PPC32-NEXT: addc 7, 25, 30 -; PPC32-NEXT: addze 10, 0 -; PPC32-NEXT: addc 11, 9, 7 -; PPC32-NEXT: addze 7, 29 -; PPC32-NEXT: addc 7, 10, 7 -; PPC32-NEXT: addze 8, 8 -; PPC32-NEXT: addc 5, 5, 7 -; PPC32-NEXT: adde 7, 28, 8 -; PPC32-NEXT: addc 12, 5, 4 -; PPC32-NEXT: adde 0, 7, 3 -; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: b .LBB0_7 start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll index 4c9aeaa3ba5a1..d6fd4f15c4e53 100644 --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -3,7 +3,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-LABEL: muloti_test: -; RISCV32: # %bb.0: # %overflow.entry +; RISCV32: # %bb.0: # %start ; RISCV32-NEXT: addi 
sp, sp, -32 ; RISCV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill @@ -11,301 +11,100 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) #0 { ; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s5, 8(sp) # 4-byte Folded Spill -; RISCV32-NEXT: sw s6, 4(sp) # 4-byte Folded Spill -; RISCV32-NEXT: sw s7, 0(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a3, 0(a1) +; RISCV32-NEXT: lw a4, 0(a1) ; RISCV32-NEXT: lw t0, 4(a1) -; RISCV32-NEXT: lw a4, 8(a1) -; RISCV32-NEXT: lw a6, 12(a1) -; RISCV32-NEXT: lw a1, 0(a2) -; RISCV32-NEXT: lw a7, 4(a2) -; RISCV32-NEXT: lw a5, 8(a2) +; RISCV32-NEXT: lw a3, 8(a1) +; RISCV32-NEXT: lw a1, 12(a1) +; RISCV32-NEXT: lw a6, 0(a2) +; RISCV32-NEXT: lw a5, 4(a2) +; RISCV32-NEXT: lw a7, 8(a2) ; RISCV32-NEXT: lw a2, 12(a2) -; RISCV32-NEXT: or t4, a4, a6 -; RISCV32-NEXT: beqz t4, .LBB0_5 -; RISCV32-NEXT: # %bb.1: # %overflow.lhs -; RISCV32-NEXT: or t5, a5, a2 -; RISCV32-NEXT: beqz t5, .LBB0_9 -; RISCV32-NEXT: # %bb.2: # %overflow -; RISCV32-NEXT: mulhu t1, a3, a1 -; RISCV32-NEXT: mul t2, t0, a1 -; RISCV32-NEXT: mulhu t3, t0, a1 -; RISCV32-NEXT: mul t6, a3, a7 -; RISCV32-NEXT: mulhu s0, a3, a7 -; RISCV32-NEXT: mul s4, t0, a7 -; RISCV32-NEXT: mul s1, a5, a3 -; RISCV32-NEXT: mul s5, a4, a1 +; RISCV32-NEXT: mulhu t1, a4, a6 +; RISCV32-NEXT: mul t2, t0, a6 +; RISCV32-NEXT: mulhu t3, t0, a6 +; RISCV32-NEXT: mul t4, a4, a5 +; RISCV32-NEXT: mulhu t5, a4, a5 ; RISCV32-NEXT: mul s2, t0, a5 -; RISCV32-NEXT: mul s3, a2, a3 -; RISCV32-NEXT: mul s6, a7, a4 -; RISCV32-NEXT: add s3, s3, s2 -; RISCV32-NEXT: mul s2, a6, a1 -; RISCV32-NEXT: add s6, s2, s6 -; RISCV32-NEXT: mulhu s7, t0, a7 +; RISCV32-NEXT: mul t6, a7, a4 +; RISCV32-NEXT: mul s3, a3, a6 +; RISCV32-NEXT: mul s0, t0, a7 +; RISCV32-NEXT: mul s1, a2, a4 +; RISCV32-NEXT: mul s4, a5, a3 +; RISCV32-NEXT: add s1, s1, s0 +; RISCV32-NEXT: mul s0, a1, a6 +; RISCV32-NEXT: add s4, s0, s4 +; RISCV32-NEXT: mulhu s5, t0, a5 ; RISCV32-NEXT: add t1, t2, t1 ; RISCV32-NEXT: sltu t2, t1, t2 ; RISCV32-NEXT: add t2, t3, t2 -; RISCV32-NEXT: mulhu s2, a5, a3 -; RISCV32-NEXT: add t1, t6, t1 -; RISCV32-NEXT: sltu t3, t1, t6 -; RISCV32-NEXT: add t3, s0, t3 -; RISCV32-NEXT: mulhu s0, a4, a1 -; RISCV32-NEXT: add t6, s5, s1 -; RISCV32-NEXT: add s3, s2, s3 -; RISCV32-NEXT: add s1, s0, s6 -; RISCV32-NEXT: sltu s5, t6, s5 -; RISCV32-NEXT: add t3, t2, t3 -; RISCV32-NEXT: sltu t2, t3, t2 -; RISCV32-NEXT: add s7, s7, t2 -; RISCV32-NEXT: add s6, s1, s3 -; RISCV32-NEXT: add t3, s4, t3 -; RISCV32-NEXT: add t2, t3, t6 -; RISCV32-NEXT: sltu s4, t3, s4 -; RISCV32-NEXT: sltu t6, t2, t3 -; RISCV32-NEXT: add s4, s7, s4 -; RISCV32-NEXT: add s5, s6, s5 -; RISCV32-NEXT: add t3, s4, s5 -; RISCV32-NEXT: add t3, t3, t6 -; RISCV32-NEXT: beq t3, s4, .LBB0_4 -; RISCV32-NEXT: # %bb.3: # %overflow -; RISCV32-NEXT: sltu t6, t3, s4 -; RISCV32-NEXT: .LBB0_4: # %overflow -; RISCV32-NEXT: sltu s2, s3, s2 -; RISCV32-NEXT: snez s3, t0 -; RISCV32-NEXT: snez s4, a2 -; RISCV32-NEXT: mulhu a2, a2, a3 -; RISCV32-NEXT: mulhu a5, t0, a5 -; RISCV32-NEXT: sltu t0, s1, s0 -; RISCV32-NEXT: snez s0, a7 -; RISCV32-NEXT: snez s1, a6 -; RISCV32-NEXT: mulhu a6, a6, a1 -; RISCV32-NEXT: mulhu a4, a7, a4 -; RISCV32-NEXT: snez a7, t5 -; RISCV32-NEXT: snez t4, t4 -; RISCV32-NEXT: and t5, s4, s3 -; RISCV32-NEXT: snez a2, a2 -; RISCV32-NEXT: snez a5, a5 -; RISCV32-NEXT: and s0, s1, s0 -; RISCV32-NEXT: snez a6, a6 -; RISCV32-NEXT: snez a4, a4 -; RISCV32-NEXT: and a7, t4, a7 -; RISCV32-NEXT: 
or a2, t5, a2 -; RISCV32-NEXT: or a6, s0, a6 -; RISCV32-NEXT: or a2, a2, a5 -; RISCV32-NEXT: or a4, a6, a4 -; RISCV32-NEXT: or a2, a2, s2 -; RISCV32-NEXT: or a4, a4, t0 -; RISCV32-NEXT: or a4, a7, a4 -; RISCV32-NEXT: or a2, a4, a2 -; RISCV32-NEXT: or t4, a2, t6 -; RISCV32-NEXT: j .LBB0_14 -; RISCV32-NEXT: .LBB0_5: # %overflow.no.lhs -; RISCV32-NEXT: or t1, a5, a2 -; RISCV32-NEXT: beqz t1, .LBB0_13 -; RISCV32-NEXT: # %bb.6: # %overflow.no.lhs.only -; RISCV32-NEXT: mulhu t1, a3, a1 -; RISCV32-NEXT: mul t6, t0, a1 -; RISCV32-NEXT: mulhu s0, t0, a1 -; RISCV32-NEXT: mul t4, a3, a7 -; RISCV32-NEXT: mulhu t5, a3, a7 -; RISCV32-NEXT: mul t2, t0, a7 -; RISCV32-NEXT: mulhu t3, t0, a7 -; RISCV32-NEXT: mulhu s1, a1, a4 -; RISCV32-NEXT: mul s2, a1, a6 -; RISCV32-NEXT: mul a7, a7, a4 -; RISCV32-NEXT: add s1, s1, s2 -; RISCV32-NEXT: mulhu s2, a5, a4 -; RISCV32-NEXT: mul a6, a5, a6 -; RISCV32-NEXT: add a6, s2, a6 -; RISCV32-NEXT: mulhu s2, a3, a5 -; RISCV32-NEXT: add a7, s1, a7 -; RISCV32-NEXT: mul s1, a2, a4 -; RISCV32-NEXT: add a6, a6, s1 -; RISCV32-NEXT: mul s1, t0, a5 -; RISCV32-NEXT: add t1, t6, t1 -; RISCV32-NEXT: sltu t6, t1, t6 -; RISCV32-NEXT: add t6, s0, t6 -; RISCV32-NEXT: mulhu s0, t0, a5 -; RISCV32-NEXT: add s2, s1, s2 -; RISCV32-NEXT: sltu s1, s2, s1 -; RISCV32-NEXT: add s0, s0, s1 -; RISCV32-NEXT: mul s1, a3, a2 -; RISCV32-NEXT: add t1, t4, t1 -; RISCV32-NEXT: sltu t4, t1, t4 -; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mul t5, t0, a2 -; RISCV32-NEXT: mulhu t0, t0, a2 -; RISCV32-NEXT: mulhu a2, a3, a2 -; RISCV32-NEXT: add s2, s1, s2 -; RISCV32-NEXT: sltu s1, s2, s1 -; RISCV32-NEXT: add a2, a2, s1 -; RISCV32-NEXT: mul s1, a1, a4 -; RISCV32-NEXT: mul a4, a5, a4 -; RISCV32-NEXT: mul a5, a3, a5 -; RISCV32-NEXT: add t4, t6, t4 -; RISCV32-NEXT: add a2, s0, a2 -; RISCV32-NEXT: sltu t6, t4, t6 -; RISCV32-NEXT: add t4, t2, t4 -; RISCV32-NEXT: sltu s0, a2, s0 -; RISCV32-NEXT: add s3, t5, a2 -; RISCV32-NEXT: add s1, t4, s1 -; RISCV32-NEXT: sltu t2, t4, t2 -; RISCV32-NEXT: add t3, t3, t6 -; RISCV32-NEXT: add a2, s3, a4 -; RISCV32-NEXT: sltu a4, s3, t5 -; RISCV32-NEXT: add t0, t0, s0 -; RISCV32-NEXT: sltu t4, s1, t4 -; RISCV32-NEXT: add t3, t3, t2 -; RISCV32-NEXT: sltu t5, a2, s3 -; RISCV32-NEXT: add a4, t0, a4 -; RISCV32-NEXT: add t2, s1, a5 -; RISCV32-NEXT: add a7, t3, a7 -; RISCV32-NEXT: add a5, a4, a6 -; RISCV32-NEXT: sltu a4, t2, s1 -; RISCV32-NEXT: add a6, a7, t4 -; RISCV32-NEXT: add t3, s2, a4 -; RISCV32-NEXT: add t3, a6, t3 -; RISCV32-NEXT: add a5, a5, t5 -; RISCV32-NEXT: beq t3, a6, .LBB0_8 -; RISCV32-NEXT: # %bb.7: # %overflow.no.lhs.only -; RISCV32-NEXT: sltu a4, t3, a6 -; RISCV32-NEXT: .LBB0_8: # %overflow.no.lhs.only -; RISCV32-NEXT: mul a1, a3, a1 -; RISCV32-NEXT: j .LBB0_12 -; RISCV32-NEXT: .LBB0_9: # %overflow.no.rhs.only -; RISCV32-NEXT: mulhu t1, a1, a3 -; RISCV32-NEXT: mul t6, a7, a3 -; RISCV32-NEXT: mulhu s0, a7, a3 -; RISCV32-NEXT: mul t4, a1, t0 -; RISCV32-NEXT: mulhu t5, a1, t0 -; RISCV32-NEXT: mul t2, a7, t0 -; RISCV32-NEXT: mulhu t3, a7, t0 -; RISCV32-NEXT: mulhu s1, a3, a5 -; RISCV32-NEXT: mul s2, a3, a2 -; RISCV32-NEXT: mul t0, t0, a5 -; RISCV32-NEXT: add s1, s1, s2 -; RISCV32-NEXT: mulhu s2, a4, a5 -; RISCV32-NEXT: mul a2, a4, a2 -; RISCV32-NEXT: add a2, s2, a2 -; RISCV32-NEXT: mulhu s2, a1, a4 -; RISCV32-NEXT: add t0, s1, t0 -; RISCV32-NEXT: mul s1, a6, a5 -; RISCV32-NEXT: add s1, a2, s1 -; RISCV32-NEXT: mul a2, a7, a4 -; RISCV32-NEXT: add t1, t6, t1 -; RISCV32-NEXT: sltu t6, t1, t6 -; RISCV32-NEXT: add t6, s0, t6 ; RISCV32-NEXT: mulhu s0, a7, a4 -; RISCV32-NEXT: add s2, 
a2, s2 -; RISCV32-NEXT: sltu a2, s2, a2 -; RISCV32-NEXT: add a2, s0, a2 -; RISCV32-NEXT: mul s0, a1, a6 ; RISCV32-NEXT: add t1, t4, t1 -; RISCV32-NEXT: sltu t4, t1, t4 -; RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: mul t5, a7, a6 -; RISCV32-NEXT: mulhu a7, a7, a6 -; RISCV32-NEXT: mulhu a6, a1, a6 -; RISCV32-NEXT: add s2, s0, s2 -; RISCV32-NEXT: sltu s0, s2, s0 -; RISCV32-NEXT: add a6, a6, s0 -; RISCV32-NEXT: mul s0, a3, a5 -; RISCV32-NEXT: mul a5, a4, a5 -; RISCV32-NEXT: mul a4, a1, a4 -; RISCV32-NEXT: add t4, t6, t4 -; RISCV32-NEXT: add a6, a2, a6 -; RISCV32-NEXT: sltu t6, t4, t6 -; RISCV32-NEXT: add t4, t2, t4 -; RISCV32-NEXT: sltu s3, a6, a2 -; RISCV32-NEXT: add a6, t5, a6 -; RISCV32-NEXT: add s0, t4, s0 -; RISCV32-NEXT: sltu t2, t4, t2 -; RISCV32-NEXT: add t3, t3, t6 -; RISCV32-NEXT: add a2, a6, a5 -; RISCV32-NEXT: sltu a5, a6, t5 -; RISCV32-NEXT: add a7, a7, s3 -; RISCV32-NEXT: sltu t4, s0, t4 -; RISCV32-NEXT: add t3, t3, t2 -; RISCV32-NEXT: sltu t5, a2, a6 -; RISCV32-NEXT: add a5, a7, a5 -; RISCV32-NEXT: add t2, s0, a4 -; RISCV32-NEXT: add a6, t3, t0 -; RISCV32-NEXT: add a5, a5, s1 -; RISCV32-NEXT: sltu a4, t2, s0 -; RISCV32-NEXT: add a6, a6, t4 -; RISCV32-NEXT: add t3, s2, a4 -; RISCV32-NEXT: add t3, a6, t3 -; RISCV32-NEXT: add a5, a5, t5 -; RISCV32-NEXT: beq t3, a6, .LBB0_11 -; RISCV32-NEXT: # %bb.10: # %overflow.no.rhs.only -; RISCV32-NEXT: sltu a4, t3, a6 -; RISCV32-NEXT: .LBB0_11: # %overflow.no.rhs.only -; RISCV32-NEXT: mul a1, a1, a3 -; RISCV32-NEXT: .LBB0_12: # %overflow.res -; RISCV32-NEXT: add a4, a2, a4 -; RISCV32-NEXT: sltu a2, a4, a2 -; RISCV32-NEXT: add a2, a5, a2 -; RISCV32-NEXT: or a2, a4, a2 -; RISCV32-NEXT: snez t4, a2 -; RISCV32-NEXT: j .LBB0_15 -; RISCV32-NEXT: .LBB0_13: # %overflow.no -; RISCV32-NEXT: li t4, 0 -; RISCV32-NEXT: mulhu t1, a3, a1 -; RISCV32-NEXT: mul t2, t0, a1 -; RISCV32-NEXT: mulhu t3, t0, a1 -; RISCV32-NEXT: mul t5, a3, a7 -; RISCV32-NEXT: mulhu t6, a3, a7 -; RISCV32-NEXT: mul s0, t0, a7 -; RISCV32-NEXT: mul s1, a5, t0 -; RISCV32-NEXT: mulhu s2, a5, a3 -; RISCV32-NEXT: add s1, s2, s1 -; RISCV32-NEXT: mul s2, a1, a4 -; RISCV32-NEXT: mul a5, a5, a3 +; RISCV32-NEXT: sltu t3, t1, t4 +; RISCV32-NEXT: add t3, t5, t3 +; RISCV32-NEXT: mulhu t5, a3, a6 +; RISCV32-NEXT: add t4, s3, t6 +; RISCV32-NEXT: add s1, s0, s1 +; RISCV32-NEXT: add t6, t5, s4 +; RISCV32-NEXT: sltu s3, t4, s3 +; RISCV32-NEXT: add t3, t2, t3 +; RISCV32-NEXT: sltu t2, t3, t2 +; RISCV32-NEXT: add s5, s5, t2 +; RISCV32-NEXT: add s4, t6, s1 +; RISCV32-NEXT: add t3, s2, t3 +; RISCV32-NEXT: add t2, t3, t4 +; RISCV32-NEXT: sltu s2, t3, s2 +; RISCV32-NEXT: sltu t4, t2, t3 +; RISCV32-NEXT: add s2, s5, s2 +; RISCV32-NEXT: add s3, s4, s3 +; RISCV32-NEXT: add t3, s2, s3 +; RISCV32-NEXT: add t3, t3, t4 +; RISCV32-NEXT: beq t3, s2, .LBB0_2 +; RISCV32-NEXT: # %bb.1: # %start +; RISCV32-NEXT: sltu t4, t3, s2 +; RISCV32-NEXT: .LBB0_2: # %start +; RISCV32-NEXT: sltu s0, s1, s0 +; RISCV32-NEXT: snez s1, t0 +; RISCV32-NEXT: snez s2, a2 +; RISCV32-NEXT: sltu t5, t6, t5 +; RISCV32-NEXT: mulhu t6, a2, a4 ; RISCV32-NEXT: mulhu t0, t0, a7 -; RISCV32-NEXT: mul a2, a2, a3 -; RISCV32-NEXT: mul a7, a7, a4 -; RISCV32-NEXT: mulhu a4, a1, a4 -; RISCV32-NEXT: mul a6, a1, a6 -; RISCV32-NEXT: add t1, t2, t1 -; RISCV32-NEXT: add s2, a5, s2 -; RISCV32-NEXT: add a4, a4, a6 -; RISCV32-NEXT: sltu a6, t1, t2 -; RISCV32-NEXT: add t1, t5, t1 -; RISCV32-NEXT: add a2, s1, a2 -; RISCV32-NEXT: add a4, a4, a7 -; RISCV32-NEXT: sltu a5, s2, a5 -; RISCV32-NEXT: add a6, t3, a6 -; RISCV32-NEXT: sltu a7, t1, t5 -; RISCV32-NEXT: add a2, 
a2, a4 -; RISCV32-NEXT: add a7, t6, a7 -; RISCV32-NEXT: add a2, a2, a5 -; RISCV32-NEXT: add a7, a6, a7 -; RISCV32-NEXT: add a4, s0, a7 -; RISCV32-NEXT: sltu a5, a7, a6 -; RISCV32-NEXT: add t2, a4, s2 -; RISCV32-NEXT: sltu a6, a4, s0 -; RISCV32-NEXT: add a5, t0, a5 -; RISCV32-NEXT: sltu t3, t2, a4 -; RISCV32-NEXT: add a5, a5, a6 -; RISCV32-NEXT: add a2, a5, a2 -; RISCV32-NEXT: add t3, a2, t3 -; RISCV32-NEXT: .LBB0_14: # %overflow.res -; RISCV32-NEXT: mul a1, a3, a1 -; RISCV32-NEXT: .LBB0_15: # %overflow.res -; RISCV32-NEXT: andi a2, t4, 1 -; RISCV32-NEXT: sw a1, 0(a0) +; RISCV32-NEXT: or a2, a7, a2 +; RISCV32-NEXT: snez a7, a5 +; RISCV32-NEXT: mul a4, a4, a6 +; RISCV32-NEXT: mulhu a6, a1, a6 +; RISCV32-NEXT: mulhu a5, a5, a3 +; RISCV32-NEXT: or a3, a3, a1 +; RISCV32-NEXT: snez a1, a1 +; RISCV32-NEXT: and s1, s2, s1 +; RISCV32-NEXT: snez t6, t6 +; RISCV32-NEXT: snez t0, t0 +; RISCV32-NEXT: and a1, a1, a7 +; RISCV32-NEXT: snez a6, a6 +; RISCV32-NEXT: snez a5, a5 +; RISCV32-NEXT: snez a2, a2 +; RISCV32-NEXT: snez a3, a3 +; RISCV32-NEXT: or a7, s1, t6 +; RISCV32-NEXT: or a1, a1, a6 +; RISCV32-NEXT: and a2, a3, a2 +; RISCV32-NEXT: or a3, a7, t0 +; RISCV32-NEXT: or a1, a1, a5 +; RISCV32-NEXT: or a3, a3, s0 +; RISCV32-NEXT: or a1, a1, t5 +; RISCV32-NEXT: or a1, a2, a1 +; RISCV32-NEXT: or a1, a1, a3 +; RISCV32-NEXT: or a1, a1, t4 +; RISCV32-NEXT: andi a1, a1, 1 +; RISCV32-NEXT: sw a4, 0(a0) ; RISCV32-NEXT: sw t1, 4(a0) ; RISCV32-NEXT: sw t2, 8(a0) ; RISCV32-NEXT: sw t3, 12(a0) -; RISCV32-NEXT: sb a2, 16(a0) +; RISCV32-NEXT: sb a1, 16(a0) ; RISCV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s3, 16(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s4, 12(sp) # 4-byte Folded Reload ; RISCV32-NEXT: lw s5, 8(sp) # 4-byte Folded Reload -; RISCV32-NEXT: lw s6, 4(sp) # 4-byte Folded Reload -; RISCV32-NEXT: lw s7, 0(sp) # 4-byte Folded Reload ; RISCV32-NEXT: addi sp, sp, 32 ; RISCV32-NEXT: ret start: diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 5ff0cffb598dc..2751332c9e3ae 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1314,173 +1314,38 @@ entry: define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: smulo.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a6, a0, 31 -; RV32-NEXT: srai a5, a2, 31 -; RV32-NEXT: beq a1, a6, .LBB21_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beq a3, a5, .LBB21_6 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mulhu a5, a0, a2 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: mulhsu a7, a1, a2 ; RV32-NEXT: mul t0, a3, a0 ; RV32-NEXT: mulh t1, a1, a3 -; RV32-NEXT: mul t2, a1, a3 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: mulhsu a3, a3, a0 -; RV32-NEXT: add a1, a6, a5 -; RV32-NEXT: sltu a5, a1, a6 -; RV32-NEXT: add a1, t0, a1 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: sltu a6, a1, t0 -; RV32-NEXT: add a3, a3, a6 -; RV32-NEXT: srai a6, a5, 31 -; RV32-NEXT: srai a7, a3, 31 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: srai a7, a1, 31 -; RV32-NEXT: add a3, a5, a3 -; RV32-NEXT: sltu a5, a3, a5 -; RV32-NEXT: add a3, t2, a3 +; RV32-NEXT: mul a2, a0, a2 ; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: sltu a6, a3, t2 -; RV32-NEXT: xor a3, a3, a7 -; RV32-NEXT: add a5, t1, a5 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: xor a5, a5, a7 -; RV32-NEXT: or a3, a3, a5 -; RV32-NEXT: snez a5, a3 -; RV32-NEXT: j .LBB21_9 -; RV32-NEXT: .LBB21_3: # 
%overflow.no.lhs -; RV32-NEXT: beq a3, a5, .LBB21_8 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: bltz a1, .LBB21_10 -; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32-NEXT: mv a5, a0 -; RV32-NEXT: mv a6, a1 -; RV32-NEXT: bgez a1, .LBB21_11 -; RV32-NEXT: j .LBB21_12 -; RV32-NEXT: .LBB21_6: # %overflow.no.rhs.only -; RV32-NEXT: bltz a3, .LBB21_14 -; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32-NEXT: mv a5, a2 -; RV32-NEXT: mv a6, a3 -; RV32-NEXT: bgez a3, .LBB21_15 -; RV32-NEXT: j .LBB21_16 -; RV32-NEXT: .LBB21_8: # %overflow.no -; RV32-NEXT: li a5, 0 -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: mul a3, a0, a3 +; RV32-NEXT: sltu a0, a5, a6 +; RV32-NEXT: add a5, t0, a5 +; RV32-NEXT: add a0, a7, a0 +; RV32-NEXT: sltu a6, a5, t0 +; RV32-NEXT: srai a7, a5, 31 +; RV32-NEXT: add a3, a3, a6 +; RV32-NEXT: srai a6, a0, 31 +; RV32-NEXT: add t0, a0, a3 +; RV32-NEXT: srai a3, a3, 31 +; RV32-NEXT: sltu a0, t0, a0 ; RV32-NEXT: add a3, a6, a3 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: .LBB21_9: # %overflow.res -; RV32-NEXT: mul a2, a0, a2 -; RV32-NEXT: j .LBB21_27 -; RV32-NEXT: .LBB21_10: -; RV32-NEXT: neg a5, a0 -; RV32-NEXT: snez a6, a0 -; RV32-NEXT: neg a7, a1 -; RV32-NEXT: sub a6, a7, a6 -; RV32-NEXT: bltz a1, .LBB21_12 -; RV32-NEXT: .LBB21_11: # %overflow.no.lhs.only -; RV32-NEXT: mv a6, a1 -; RV32-NEXT: mv a5, a0 -; RV32-NEXT: .LBB21_12: # %overflow.no.lhs.only -; RV32-NEXT: bltz a3, .LBB21_18 -; RV32-NEXT: # %bb.13: # %overflow.no.lhs.only -; RV32-NEXT: mv a7, a2 -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: j .LBB21_19 -; RV32-NEXT: .LBB21_14: -; RV32-NEXT: neg a5, a2 -; RV32-NEXT: snez a6, a2 -; RV32-NEXT: neg a7, a3 -; RV32-NEXT: sub a6, a7, a6 -; RV32-NEXT: bltz a3, .LBB21_16 -; RV32-NEXT: .LBB21_15: # %overflow.no.rhs.only -; RV32-NEXT: mv a6, a3 -; RV32-NEXT: mv a5, a2 -; RV32-NEXT: .LBB21_16: # %overflow.no.rhs.only -; RV32-NEXT: bltz a1, .LBB21_22 -; RV32-NEXT: # %bb.17: # %overflow.no.rhs.only -; RV32-NEXT: mv a7, a0 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: j .LBB21_23 -; RV32-NEXT: .LBB21_18: -; RV32-NEXT: neg a7, a2 -; RV32-NEXT: snez a0, a2 -; RV32-NEXT: neg t0, a3 -; RV32-NEXT: sub a0, t0, a0 -; RV32-NEXT: .LBB21_19: # %overflow.no.lhs.only -; RV32-NEXT: slti a1, a1, 0 -; RV32-NEXT: slti t0, a3, 0 -; RV32-NEXT: bltz a3, .LBB21_21 -; RV32-NEXT: # %bb.20: # %overflow.no.lhs.only -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a7, a2 -; RV32-NEXT: .LBB21_21: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a2, a5, a7 -; RV32-NEXT: mul a3, a6, a7 -; RV32-NEXT: mul a7, a5, a7 -; RV32-NEXT: mul a6, a6, a0 -; RV32-NEXT: mulhu t1, a5, a0 -; RV32-NEXT: mul a0, a5, a0 -; RV32-NEXT: xor a1, t0, a1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a6, t1, a6 -; RV32-NEXT: neg a3, a1 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: xor a5, a7, a3 -; RV32-NEXT: sltu a7, a0, a2 -; RV32-NEXT: add a2, a5, a1 -; RV32-NEXT: xor a0, a0, a3 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: sltu a5, a2, a1 -; RV32-NEXT: add a1, a0, a5 -; RV32-NEXT: sltu a0, a1, a5 -; RV32-NEXT: xor a3, a6, a3 +; RV32-NEXT: add t0, a1, t0 ; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: j .LBB21_26 -; RV32-NEXT: .LBB21_22: -; RV32-NEXT: neg a7, a0 -; RV32-NEXT: snez a2, a0 -; RV32-NEXT: neg t0, a1 -; RV32-NEXT: sub a2, t0, a2 -; RV32-NEXT: .LBB21_23: # %overflow.no.rhs.only -; RV32-NEXT: slti a3, a3, 0 -; RV32-NEXT: slti t0, a1, 0 -; RV32-NEXT: bltz a1, .LBB21_25 -; RV32-NEXT: # %bb.24: # %overflow.no.rhs.only -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: mv a7, a0 -; RV32-NEXT: .LBB21_25: # %overflow.no.rhs.only -; 
RV32-NEXT: mulhu a0, a5, a7 -; RV32-NEXT: mul a1, a6, a7 -; RV32-NEXT: mul a7, a5, a7 -; RV32-NEXT: mul a6, a6, a2 -; RV32-NEXT: mulhu t1, a5, a2 -; RV32-NEXT: mul a2, a5, a2 -; RV32-NEXT: xor a3, a3, t0 +; RV32-NEXT: sltu a1, t0, a1 +; RV32-NEXT: xor a3, t0, a7 +; RV32-NEXT: add a0, t1, a0 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a6, t1, a6 -; RV32-NEXT: neg a5, a3 -; RV32-NEXT: add a1, a0, a2 -; RV32-NEXT: xor a2, a7, a5 -; RV32-NEXT: sltu a0, a1, a0 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: xor a1, a1, a5 -; RV32-NEXT: add a0, a6, a0 -; RV32-NEXT: sltu a3, a2, a3 -; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: sltu a3, a1, a3 -; RV32-NEXT: xor a0, a0, a5 -; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: .LBB21_26: # %overflow.res -; RV32-NEXT: snez a5, a0 -; RV32-NEXT: .LBB21_27: # %overflow.res -; RV32-NEXT: andi a0, a5, 1 +; RV32-NEXT: xor a0, a0, a7 +; RV32-NEXT: or a0, a3, a0 +; RV32-NEXT: snez a0, a0 ; RV32-NEXT: sw a2, 0(a4) -; RV32-NEXT: sw a1, 4(a4) +; RV32-NEXT: sw a5, 4(a4) ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.i64: @@ -1494,173 +1359,38 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a6, a0, 31 -; RV32ZBA-NEXT: srai a5, a2, 31 -; RV32ZBA-NEXT: beq a1, a6, .LBB21_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beq a3, a5, .LBB21_6 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a5, a0, a2 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: mulhsu a7, a1, a2 ; RV32ZBA-NEXT: mul t0, a3, a0 ; RV32ZBA-NEXT: mulh t1, a1, a3 -; RV32ZBA-NEXT: mul t2, a1, a3 +; RV32ZBA-NEXT: mul a1, a1, a3 ; RV32ZBA-NEXT: mulhsu a3, a3, a0 -; RV32ZBA-NEXT: add a1, a6, a5 -; RV32ZBA-NEXT: sltu a5, a1, a6 -; RV32ZBA-NEXT: add a1, t0, a1 -; RV32ZBA-NEXT: add a5, a7, a5 -; RV32ZBA-NEXT: sltu a6, a1, t0 -; RV32ZBA-NEXT: add a3, a3, a6 -; RV32ZBA-NEXT: srai a6, a5, 31 -; RV32ZBA-NEXT: srai a7, a3, 31 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: srai a7, a1, 31 -; RV32ZBA-NEXT: add a3, a5, a3 -; RV32ZBA-NEXT: sltu a5, a3, a5 -; RV32ZBA-NEXT: add a3, t2, a3 +; RV32ZBA-NEXT: mul a2, a0, a2 ; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: sltu a6, a3, t2 -; RV32ZBA-NEXT: xor a3, a3, a7 -; RV32ZBA-NEXT: add a5, t1, a5 -; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: xor a5, a5, a7 -; RV32ZBA-NEXT: or a3, a3, a5 -; RV32ZBA-NEXT: snez a5, a3 -; RV32ZBA-NEXT: j .LBB21_9 -; RV32ZBA-NEXT: .LBB21_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beq a3, a5, .LBB21_8 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a1, .LBB21_10 -; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a5, a0 -; RV32ZBA-NEXT: mv a6, a1 -; RV32ZBA-NEXT: bgez a1, .LBB21_11 -; RV32ZBA-NEXT: j .LBB21_12 -; RV32ZBA-NEXT: .LBB21_6: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a3, .LBB21_14 -; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a5, a2 -; RV32ZBA-NEXT: mv a6, a3 -; RV32ZBA-NEXT: bgez a3, .LBB21_15 -; RV32ZBA-NEXT: j .LBB21_16 -; RV32ZBA-NEXT: .LBB21_8: # %overflow.no -; RV32ZBA-NEXT: li a5, 0 -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: mul a3, a0, a3 +; RV32ZBA-NEXT: sltu a0, a5, a6 +; RV32ZBA-NEXT: add a5, t0, a5 +; RV32ZBA-NEXT: add a0, a7, a0 +; RV32ZBA-NEXT: sltu a6, a5, t0 +; RV32ZBA-NEXT: srai a7, a5, 31 +; RV32ZBA-NEXT: add a3, a3, a6 +; RV32ZBA-NEXT: srai a6, a0, 31 +; RV32ZBA-NEXT: add t0, a0, a3 +; RV32ZBA-NEXT: srai a3, a3, 31 +; RV32ZBA-NEXT: sltu a0, t0, a0 ; RV32ZBA-NEXT: add a3, a6, a3 -; 
RV32ZBA-NEXT: mul a1, a1, a2 -; RV32ZBA-NEXT: add a1, a3, a1 -; RV32ZBA-NEXT: .LBB21_9: # %overflow.res -; RV32ZBA-NEXT: mul a2, a0, a2 -; RV32ZBA-NEXT: j .LBB21_27 -; RV32ZBA-NEXT: .LBB21_10: -; RV32ZBA-NEXT: neg a5, a0 -; RV32ZBA-NEXT: snez a6, a0 -; RV32ZBA-NEXT: neg a7, a1 -; RV32ZBA-NEXT: sub a6, a7, a6 -; RV32ZBA-NEXT: bltz a1, .LBB21_12 -; RV32ZBA-NEXT: .LBB21_11: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a6, a1 -; RV32ZBA-NEXT: mv a5, a0 -; RV32ZBA-NEXT: .LBB21_12: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a3, .LBB21_18 -; RV32ZBA-NEXT: # %bb.13: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a7, a2 -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: j .LBB21_19 -; RV32ZBA-NEXT: .LBB21_14: -; RV32ZBA-NEXT: neg a5, a2 -; RV32ZBA-NEXT: snez a6, a2 -; RV32ZBA-NEXT: neg a7, a3 -; RV32ZBA-NEXT: sub a6, a7, a6 -; RV32ZBA-NEXT: bltz a3, .LBB21_16 -; RV32ZBA-NEXT: .LBB21_15: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a6, a3 -; RV32ZBA-NEXT: mv a5, a2 -; RV32ZBA-NEXT: .LBB21_16: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a1, .LBB21_22 -; RV32ZBA-NEXT: # %bb.17: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a7, a0 -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: j .LBB21_23 -; RV32ZBA-NEXT: .LBB21_18: -; RV32ZBA-NEXT: neg a7, a2 -; RV32ZBA-NEXT: snez a0, a2 -; RV32ZBA-NEXT: neg t0, a3 -; RV32ZBA-NEXT: sub a0, t0, a0 -; RV32ZBA-NEXT: .LBB21_19: # %overflow.no.lhs.only -; RV32ZBA-NEXT: slti a1, a1, 0 -; RV32ZBA-NEXT: slti t0, a3, 0 -; RV32ZBA-NEXT: bltz a3, .LBB21_21 -; RV32ZBA-NEXT: # %bb.20: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: mv a7, a2 -; RV32ZBA-NEXT: .LBB21_21: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a2, a5, a7 -; RV32ZBA-NEXT: mul a3, a6, a7 -; RV32ZBA-NEXT: mul a7, a5, a7 -; RV32ZBA-NEXT: mul a6, a6, a0 -; RV32ZBA-NEXT: mulhu t1, a5, a0 -; RV32ZBA-NEXT: mul a0, a5, a0 -; RV32ZBA-NEXT: xor a1, t0, a1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: add a6, t1, a6 -; RV32ZBA-NEXT: neg a3, a1 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: xor a5, a7, a3 -; RV32ZBA-NEXT: sltu a7, a0, a2 -; RV32ZBA-NEXT: add a2, a5, a1 -; RV32ZBA-NEXT: xor a0, a0, a3 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: sltu a5, a2, a1 -; RV32ZBA-NEXT: add a1, a0, a5 -; RV32ZBA-NEXT: sltu a0, a1, a5 -; RV32ZBA-NEXT: xor a3, a6, a3 +; RV32ZBA-NEXT: add t0, a1, t0 ; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: j .LBB21_26 -; RV32ZBA-NEXT: .LBB21_22: -; RV32ZBA-NEXT: neg a7, a0 -; RV32ZBA-NEXT: snez a2, a0 -; RV32ZBA-NEXT: neg t0, a1 -; RV32ZBA-NEXT: sub a2, t0, a2 -; RV32ZBA-NEXT: .LBB21_23: # %overflow.no.rhs.only -; RV32ZBA-NEXT: slti a3, a3, 0 -; RV32ZBA-NEXT: slti t0, a1, 0 -; RV32ZBA-NEXT: bltz a1, .LBB21_25 -; RV32ZBA-NEXT: # %bb.24: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: mv a7, a0 -; RV32ZBA-NEXT: .LBB21_25: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a0, a5, a7 -; RV32ZBA-NEXT: mul a1, a6, a7 -; RV32ZBA-NEXT: mul a7, a5, a7 -; RV32ZBA-NEXT: mul a6, a6, a2 -; RV32ZBA-NEXT: mulhu t1, a5, a2 -; RV32ZBA-NEXT: mul a2, a5, a2 -; RV32ZBA-NEXT: xor a3, a3, t0 +; RV32ZBA-NEXT: sltu a1, t0, a1 +; RV32ZBA-NEXT: xor a3, t0, a7 +; RV32ZBA-NEXT: add a0, t1, a0 ; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a6, t1, a6 -; RV32ZBA-NEXT: neg a5, a3 -; RV32ZBA-NEXT: add a1, a0, a2 -; RV32ZBA-NEXT: xor a2, a7, a5 -; RV32ZBA-NEXT: sltu a0, a1, a0 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: xor a1, a1, a5 -; RV32ZBA-NEXT: add a0, a6, a0 -; RV32ZBA-NEXT: sltu a3, a2, a3 -; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: sltu a3, a1, a3 -; RV32ZBA-NEXT: xor 
a0, a0, a5 -; RV32ZBA-NEXT: add a0, a0, a3 -; RV32ZBA-NEXT: .LBB21_26: # %overflow.res -; RV32ZBA-NEXT: snez a5, a0 -; RV32ZBA-NEXT: .LBB21_27: # %overflow.res -; RV32ZBA-NEXT: andi a0, a5, 1 +; RV32ZBA-NEXT: xor a0, a0, a7 +; RV32ZBA-NEXT: or a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 ; RV32ZBA-NEXT: sw a2, 0(a4) -; RV32ZBA-NEXT: sw a1, 4(a4) +; RV32ZBA-NEXT: sw a5, 4(a4) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.i64: @@ -1674,165 +1404,38 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: srai a6, a0, 31 -; RV32ZICOND-NEXT: srai a5, a2, 31 -; RV32ZICOND-NEXT: beq a1, a6, .LBB21_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beq a3, a5, .LBB21_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mulhu a5, a0, a2 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhsu a7, a1, a2 ; RV32ZICOND-NEXT: mul t0, a3, a0 ; RV32ZICOND-NEXT: mulh t1, a1, a3 -; RV32ZICOND-NEXT: mul t2, a1, a3 +; RV32ZICOND-NEXT: mul a1, a1, a3 ; RV32ZICOND-NEXT: mulhsu a3, a3, a0 -; RV32ZICOND-NEXT: add a1, a6, a5 -; RV32ZICOND-NEXT: sltu a5, a1, a6 -; RV32ZICOND-NEXT: add a1, t0, a1 -; RV32ZICOND-NEXT: add a5, a7, a5 -; RV32ZICOND-NEXT: sltu a6, a1, t0 -; RV32ZICOND-NEXT: add a3, a3, a6 -; RV32ZICOND-NEXT: srai a6, a5, 31 -; RV32ZICOND-NEXT: srai a7, a3, 31 -; RV32ZICOND-NEXT: add a6, a6, a7 -; RV32ZICOND-NEXT: srai a7, a1, 31 -; RV32ZICOND-NEXT: add a3, a5, a3 -; RV32ZICOND-NEXT: sltu a5, a3, a5 -; RV32ZICOND-NEXT: add a3, t2, a3 +; RV32ZICOND-NEXT: mul a2, a0, a2 ; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: sltu a6, a3, t2 -; RV32ZICOND-NEXT: xor a3, a3, a7 -; RV32ZICOND-NEXT: add a5, t1, a5 -; RV32ZICOND-NEXT: add a5, a5, a6 -; RV32ZICOND-NEXT: xor a5, a5, a7 -; RV32ZICOND-NEXT: or a3, a3, a5 -; RV32ZICOND-NEXT: snez a5, a3 -; RV32ZICOND-NEXT: j .LBB21_7 -; RV32ZICOND-NEXT: .LBB21_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beq a3, a5, .LBB21_6 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: slti a5, a1, 0 -; RV32ZICOND-NEXT: neg a6, a0 -; RV32ZICOND-NEXT: snez a7, a0 -; RV32ZICOND-NEXT: neg t0, a1 -; RV32ZICOND-NEXT: snez t1, a2 -; RV32ZICOND-NEXT: sub a7, t0, a7 -; RV32ZICOND-NEXT: neg t0, a3 -; RV32ZICOND-NEXT: sub t0, t0, t1 -; RV32ZICOND-NEXT: slti t1, a3, 0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 -; RV32ZICOND-NEXT: czero.nez a0, a0, a5 -; RV32ZICOND-NEXT: or a6, a6, a0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 -; RV32ZICOND-NEXT: or a0, a6, a0 -; RV32ZICOND-NEXT: neg a6, a2 -; RV32ZICOND-NEXT: czero.nez a1, a1, a5 -; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 -; RV32ZICOND-NEXT: czero.nez a2, a2, t1 -; RV32ZICOND-NEXT: czero.nez a3, a3, t1 -; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 -; RV32ZICOND-NEXT: or a7, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 -; RV32ZICOND-NEXT: xor a5, t1, a5 -; RV32ZICOND-NEXT: or a6, a6, a2 -; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 -; RV32ZICOND-NEXT: or t0, t0, a3 -; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 -; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 -; RV32ZICOND-NEXT: neg t1, a5 -; RV32ZICOND-NEXT: or a2, a6, a2 -; RV32ZICOND-NEXT: or a1, a7, a1 -; RV32ZICOND-NEXT: or a3, t0, a3 -; RV32ZICOND-NEXT: mulhu a6, a0, a2 -; RV32ZICOND-NEXT: mul a7, a0, a2 -; RV32ZICOND-NEXT: mul a2, a1, a2 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: mulhu t0, a0, a3 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: xor a3, a7, t1 -; RV32ZICOND-NEXT: add a6, a6, a2 -; 
RV32ZICOND-NEXT: add a1, t0, a1 -; RV32ZICOND-NEXT: add a2, a3, a5 -; RV32ZICOND-NEXT: add a0, a6, a0 -; RV32ZICOND-NEXT: sltu a3, a2, a5 -; RV32ZICOND-NEXT: sltu a5, a0, a6 -; RV32ZICOND-NEXT: xor a0, a0, t1 -; RV32ZICOND-NEXT: add a5, a1, a5 -; RV32ZICOND-NEXT: add a1, a0, a3 -; RV32ZICOND-NEXT: sltu a0, a1, a3 -; RV32ZICOND-NEXT: xor a3, a5, t1 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: snez a5, a0 -; RV32ZICOND-NEXT: j .LBB21_8 -; RV32ZICOND-NEXT: .LBB21_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: slti a5, a3, 0 -; RV32ZICOND-NEXT: neg a6, a2 -; RV32ZICOND-NEXT: snez a7, a2 -; RV32ZICOND-NEXT: neg t0, a3 -; RV32ZICOND-NEXT: snez t1, a0 -; RV32ZICOND-NEXT: sub a7, t0, a7 -; RV32ZICOND-NEXT: neg t0, a1 -; RV32ZICOND-NEXT: sub t0, t0, t1 -; RV32ZICOND-NEXT: slti t1, a1, 0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 -; RV32ZICOND-NEXT: czero.nez a2, a2, a5 -; RV32ZICOND-NEXT: or a6, a6, a2 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a5 -; RV32ZICOND-NEXT: or a2, a6, a2 -; RV32ZICOND-NEXT: neg a6, a0 -; RV32ZICOND-NEXT: czero.nez a3, a3, a5 -; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 -; RV32ZICOND-NEXT: czero.nez a0, a0, t1 -; RV32ZICOND-NEXT: czero.nez a1, a1, t1 -; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 -; RV32ZICOND-NEXT: or a7, a7, a3 -; RV32ZICOND-NEXT: czero.eqz a7, a7, a5 -; RV32ZICOND-NEXT: xor a5, a5, t1 -; RV32ZICOND-NEXT: or a6, a6, a0 -; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 -; RV32ZICOND-NEXT: or t0, t0, a1 -; RV32ZICOND-NEXT: czero.eqz a6, a6, t1 -; RV32ZICOND-NEXT: czero.eqz t0, t0, t1 -; RV32ZICOND-NEXT: neg t1, a5 -; RV32ZICOND-NEXT: or a0, a6, a0 -; RV32ZICOND-NEXT: or a3, a7, a3 -; RV32ZICOND-NEXT: or a1, t0, a1 -; RV32ZICOND-NEXT: mulhu a6, a2, a0 -; RV32ZICOND-NEXT: mul a7, a2, a0 -; RV32ZICOND-NEXT: mul a0, a3, a0 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: mulhu t0, a2, a1 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: xor a2, a7, t1 -; RV32ZICOND-NEXT: add a0, a6, a0 -; RV32ZICOND-NEXT: add a3, t0, a3 -; RV32ZICOND-NEXT: add a2, a2, a5 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a5, a2, a5 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a1, t1 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: add a1, a1, a5 -; RV32ZICOND-NEXT: sltu a3, a1, a5 -; RV32ZICOND-NEXT: xor a0, a0, t1 -; RV32ZICOND-NEXT: add a0, a0, a3 -; RV32ZICOND-NEXT: snez a5, a0 -; RV32ZICOND-NEXT: j .LBB21_8 -; RV32ZICOND-NEXT: .LBB21_6: # %overflow.no -; RV32ZICOND-NEXT: li a5, 0 -; RV32ZICOND-NEXT: mulhu a6, a0, a2 -; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: sltu a0, a5, a6 +; RV32ZICOND-NEXT: add a5, t0, a5 +; RV32ZICOND-NEXT: add a0, a7, a0 +; RV32ZICOND-NEXT: sltu a6, a5, t0 +; RV32ZICOND-NEXT: srai a7, a5, 31 +; RV32ZICOND-NEXT: add a3, a3, a6 +; RV32ZICOND-NEXT: srai a6, a0, 31 +; RV32ZICOND-NEXT: add t0, a0, a3 +; RV32ZICOND-NEXT: srai a3, a3, 31 +; RV32ZICOND-NEXT: sltu a0, t0, a0 ; RV32ZICOND-NEXT: add a3, a6, a3 -; RV32ZICOND-NEXT: mul a1, a1, a2 -; RV32ZICOND-NEXT: add a1, a3, a1 -; RV32ZICOND-NEXT: .LBB21_7: # %overflow.res -; RV32ZICOND-NEXT: mul a2, a0, a2 -; RV32ZICOND-NEXT: .LBB21_8: # %overflow.res -; RV32ZICOND-NEXT: andi a0, a5, 1 +; RV32ZICOND-NEXT: add t0, a1, t0 +; RV32ZICOND-NEXT: add a0, a3, a0 +; RV32ZICOND-NEXT: sltu a1, t0, a1 +; RV32ZICOND-NEXT: xor a3, t0, a7 +; RV32ZICOND-NEXT: add a0, t1, a0 +; RV32ZICOND-NEXT: add a0, a0, a1 +; RV32ZICOND-NEXT: xor a0, a0, a7 +; RV32ZICOND-NEXT: or a0, a3, a0 +; RV32ZICOND-NEXT: snez a0, a0 ; RV32ZICOND-NEXT: sw a2, 0(a4) -; RV32ZICOND-NEXT: sw a1, 4(a4) +; 
RV32ZICOND-NEXT: sw a5, 4(a4) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.i64: @@ -1854,57 +1457,23 @@ entry: define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: smulo2.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a3, a0, 31 -; RV32-NEXT: beq a1, a3, .LBB22_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: bltz a1, .LBB22_4 -; RV32-NEXT: # %bb.2: # %overflow.lhs -; RV32-NEXT: mv a3, a0 -; RV32-NEXT: mv a4, a1 -; RV32-NEXT: bgez a1, .LBB22_5 -; RV32-NEXT: j .LBB22_6 -; RV32-NEXT: .LBB22_3: # %overflow.no.lhs -; RV32-NEXT: li a4, 0 -; RV32-NEXT: li a5, 13 -; RV32-NEXT: mulhu a3, a0, a5 -; RV32-NEXT: mul a1, a1, a5 -; RV32-NEXT: add a3, a3, a1 -; RV32-NEXT: mul a1, a0, a5 -; RV32-NEXT: j .LBB22_7 -; RV32-NEXT: .LBB22_4: -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: snez a4, a0 -; RV32-NEXT: neg a5, a1 -; RV32-NEXT: sub a4, a5, a4 -; RV32-NEXT: bltz a1, .LBB22_6 -; RV32-NEXT: .LBB22_5: # %overflow.lhs -; RV32-NEXT: mv a4, a1 -; RV32-NEXT: mv a3, a0 -; RV32-NEXT: .LBB22_6: # %overflow.lhs -; RV32-NEXT: li a0, 13 -; RV32-NEXT: mul a5, a3, a0 -; RV32-NEXT: mulhu a3, a3, a0 -; RV32-NEXT: mulhu a6, a4, a0 -; RV32-NEXT: mul a0, a4, a0 -; RV32-NEXT: srai a4, a1, 31 -; RV32-NEXT: srli a7, a1, 31 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: xor a1, a5, a4 -; RV32-NEXT: sltu a3, a0, a3 -; RV32-NEXT: add a1, a1, a7 -; RV32-NEXT: xor a0, a0, a4 -; RV32-NEXT: add a6, a6, a3 -; RV32-NEXT: sltu a5, a1, a7 -; RV32-NEXT: add a3, a0, a5 -; RV32-NEXT: sltu a0, a3, a5 -; RV32-NEXT: xor a4, a6, a4 -; RV32-NEXT: add a0, a4, a0 -; RV32-NEXT: snez a4, a0 -; RV32-NEXT: .LBB22_7: # %overflow.res -; RV32-NEXT: andi a0, a4, 1 -; RV32-NEXT: sw a1, 0(a2) -; RV32-NEXT: sw a3, 4(a2) +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mulhu a4, a0, a3 +; RV32-NEXT: mul a5, a1, a3 +; RV32-NEXT: mulh a1, a1, a3 +; RV32-NEXT: mul a3, a0, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: sltu a0, a4, a5 +; RV32-NEXT: srai a5, a4, 31 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: xor a1, a0, a5 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: xor a0, a0, a5 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: sw a3, 0(a2) +; RV32-NEXT: sw a4, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: smulo2.i64: @@ -1919,61 +1488,25 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo2.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a3, a0, 31 -; RV32ZBA-NEXT: beq a1, a3, .LBB22_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: bltz a1, .LBB22_4 -; RV32ZBA-NEXT: # %bb.2: # %overflow.lhs -; RV32ZBA-NEXT: mv a3, a0 -; RV32ZBA-NEXT: mv a4, a1 -; RV32ZBA-NEXT: bgez a1, .LBB22_5 -; RV32ZBA-NEXT: j .LBB22_6 -; RV32ZBA-NEXT: .LBB22_3: # %overflow.no.lhs -; RV32ZBA-NEXT: li a3, 0 +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a3, 13 ; RV32ZBA-NEXT: sh1add a4, a1, a1 +; RV32ZBA-NEXT: sh1add a5, a0, a0 ; RV32ZBA-NEXT: sh2add a4, a4, a1 -; RV32ZBA-NEXT: li a1, 13 -; RV32ZBA-NEXT: mulhu a1, a0, a1 -; RV32ZBA-NEXT: add a4, a1, a4 -; RV32ZBA-NEXT: sh1add a1, a0, a0 -; RV32ZBA-NEXT: sh2add a1, a1, a0 -; RV32ZBA-NEXT: j .LBB22_7 -; RV32ZBA-NEXT: .LBB22_4: -; RV32ZBA-NEXT: neg a3, a0 -; RV32ZBA-NEXT: snez a4, a0 -; RV32ZBA-NEXT: neg a5, a1 -; RV32ZBA-NEXT: sub a4, a5, a4 -; RV32ZBA-NEXT: bltz a1, .LBB22_6 -; RV32ZBA-NEXT: .LBB22_5: # %overflow.lhs -; RV32ZBA-NEXT: mv a4, a1 -; RV32ZBA-NEXT: mv a3, a0 -; RV32ZBA-NEXT: .LBB22_6: # %overflow.lhs -; RV32ZBA-NEXT: sh1add a0, a3, a3 -; RV32ZBA-NEXT: li a5, 13 -; RV32ZBA-NEXT: sh1add a6, a4, a4 -; 
RV32ZBA-NEXT: sh2add a0, a0, a3 -; RV32ZBA-NEXT: mulhu a3, a3, a5 -; RV32ZBA-NEXT: sh2add a6, a6, a4 -; RV32ZBA-NEXT: mulhu a4, a4, a5 -; RV32ZBA-NEXT: srai a5, a1, 31 -; RV32ZBA-NEXT: srli a7, a1, 31 -; RV32ZBA-NEXT: add a6, a3, a6 -; RV32ZBA-NEXT: xor a0, a0, a5 -; RV32ZBA-NEXT: sltu a3, a6, a3 -; RV32ZBA-NEXT: add a1, a0, a7 -; RV32ZBA-NEXT: xor a0, a6, a5 +; RV32ZBA-NEXT: mulh a1, a1, a3 +; RV32ZBA-NEXT: mulhu a3, a0, a3 +; RV32ZBA-NEXT: sh2add a5, a5, a0 ; RV32ZBA-NEXT: add a3, a4, a3 -; RV32ZBA-NEXT: sltu a6, a1, a7 -; RV32ZBA-NEXT: add a4, a0, a6 -; RV32ZBA-NEXT: sltu a0, a4, a6 -; RV32ZBA-NEXT: xor a3, a3, a5 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: snez a3, a0 -; RV32ZBA-NEXT: .LBB22_7: # %overflow.res -; RV32ZBA-NEXT: andi a0, a3, 1 -; RV32ZBA-NEXT: sw a1, 0(a2) -; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: sltu a0, a3, a4 +; RV32ZBA-NEXT: srai a4, a3, 31 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: xor a1, a0, a4 +; RV32ZBA-NEXT: srai a0, a0, 31 +; RV32ZBA-NEXT: xor a0, a0, a4 +; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw a5, 0(a2) +; RV32ZBA-NEXT: sw a3, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo2.i64: @@ -1989,56 +1522,23 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo2.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: srai a3, a0, 31 -; RV32ZICOND-NEXT: beq a1, a3, .LBB22_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: slti a3, a1, 0 -; RV32ZICOND-NEXT: neg a4, a0 -; RV32ZICOND-NEXT: snez a5, a0 -; RV32ZICOND-NEXT: neg a6, a1 -; RV32ZICOND-NEXT: czero.eqz a4, a4, a3 -; RV32ZICOND-NEXT: czero.nez a0, a0, a3 -; RV32ZICOND-NEXT: sub a5, a6, a5 -; RV32ZICOND-NEXT: czero.nez a6, a1, a3 -; RV32ZICOND-NEXT: or a4, a4, a0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a3 -; RV32ZICOND-NEXT: or a5, a5, a6 -; RV32ZICOND-NEXT: czero.eqz a4, a4, a3 -; RV32ZICOND-NEXT: czero.eqz a3, a5, a3 -; RV32ZICOND-NEXT: li a5, 13 -; RV32ZICOND-NEXT: or a0, a4, a0 -; RV32ZICOND-NEXT: or a3, a3, a6 -; RV32ZICOND-NEXT: mul a4, a0, a5 -; RV32ZICOND-NEXT: mulhu a0, a0, a5 -; RV32ZICOND-NEXT: mulhu a6, a3, a5 -; RV32ZICOND-NEXT: mul a3, a3, a5 -; RV32ZICOND-NEXT: srai a5, a1, 31 -; RV32ZICOND-NEXT: srli a7, a1, 31 -; RV32ZICOND-NEXT: xor a1, a4, a5 -; RV32ZICOND-NEXT: add a3, a0, a3 -; RV32ZICOND-NEXT: add a1, a1, a7 -; RV32ZICOND-NEXT: sltu a0, a3, a0 -; RV32ZICOND-NEXT: sltu a4, a1, a7 -; RV32ZICOND-NEXT: xor a3, a3, a5 -; RV32ZICOND-NEXT: add a0, a6, a0 -; RV32ZICOND-NEXT: add a3, a3, a4 -; RV32ZICOND-NEXT: sltu a4, a3, a4 +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: li a3, 13 +; RV32ZICOND-NEXT: mulhu a4, a0, a3 +; RV32ZICOND-NEXT: mul a5, a1, a3 +; RV32ZICOND-NEXT: mulh a1, a1, a3 +; RV32ZICOND-NEXT: mul a3, a0, a3 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: sltu a0, a4, a5 +; RV32ZICOND-NEXT: srai a5, a4, 31 +; RV32ZICOND-NEXT: add a0, a1, a0 +; RV32ZICOND-NEXT: xor a1, a0, a5 +; RV32ZICOND-NEXT: srai a0, a0, 31 ; RV32ZICOND-NEXT: xor a0, a0, a5 -; RV32ZICOND-NEXT: add a0, a0, a4 -; RV32ZICOND-NEXT: snez a4, a0 -; RV32ZICOND-NEXT: j .LBB22_3 -; RV32ZICOND-NEXT: .LBB22_2: # %overflow.no.lhs -; RV32ZICOND-NEXT: li a4, 0 -; RV32ZICOND-NEXT: li a5, 13 -; RV32ZICOND-NEXT: mulhu a3, a0, a5 -; RV32ZICOND-NEXT: mul a1, a1, a5 -; RV32ZICOND-NEXT: add a3, a3, a1 -; RV32ZICOND-NEXT: mul a1, a0, a5 -; RV32ZICOND-NEXT: .LBB22_3: # %overflow.res -; RV32ZICOND-NEXT: andi a0, a4, 1 -; RV32ZICOND-NEXT: sw a1, 0(a2) -; RV32ZICOND-NEXT: sw a3, 4(a2) +; 
RV32ZICOND-NEXT: or a0, a1, a0 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: sw a3, 0(a2) +; RV32ZICOND-NEXT: sw a4, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo2.i64: @@ -2266,71 +1766,26 @@ define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) { define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: umulo.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB26_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beqz a3, .LBB26_5 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a5, a3, a0 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: mulhu a7, a0, a2 ; RV32-NEXT: snez t0, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: snez a6, a1 -; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: mulhu a3, a3, a0 -; RV32-NEXT: and a6, a6, t0 -; RV32-NEXT: snez t0, a1 -; RV32-NEXT: snez a3, a3 -; RV32-NEXT: add a1, a7, a5 -; RV32-NEXT: or a5, a6, t0 -; RV32-NEXT: sltu a6, a1, a7 -; RV32-NEXT: or a3, a5, a3 -; RV32-NEXT: or a6, a3, a6 -; RV32-NEXT: j .LBB26_7 -; RV32-NEXT: .LBB26_3: # %overflow.no.lhs -; RV32-NEXT: beqz a3, .LBB26_6 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: mul a7, a1, a2 -; RV32-NEXT: mul a5, a0, a2 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: mulhu a2, a0, a3 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a2, a2, a1 -; RV32-NEXT: mul a1, a0, a3 -; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: sltu a0, a1, a6 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: snez a6, a0 -; RV32-NEXT: j .LBB26_8 -; RV32-NEXT: .LBB26_5: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a6, a2, a0 -; RV32-NEXT: mul a7, a3, a0 -; RV32-NEXT: mul a5, a2, a0 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: mulhu a0, a2, a1 -; RV32-NEXT: mul a3, a3, a1 -; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: mul a1, a2, a1 -; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: sltu a2, a1, a6 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: snez a6, a0 -; RV32-NEXT: j .LBB26_8 -; RV32-NEXT: .LBB26_6: # %overflow.no -; RV32-NEXT: li a6, 0 -; RV32-NEXT: mulhu a5, a0, a2 -; RV32-NEXT: mul a3, a0, a3 -; RV32-NEXT: add a3, a5, a3 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: .LBB26_7: # %overflow.res -; RV32-NEXT: mul a5, a0, a2 -; RV32-NEXT: .LBB26_8: # %overflow.res -; RV32-NEXT: andi a0, a6, 1 -; RV32-NEXT: sw a5, 0(a4) -; RV32-NEXT: sw a1, 4(a4) +; RV32-NEXT: mul t1, a0, a2 +; RV32-NEXT: mulhu a0, a1, a2 +; RV32-NEXT: snez a1, a1 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: and a1, a1, t0 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: snez a2, a3 +; RV32-NEXT: add a5, a7, a5 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: sltu a1, a5, a7 +; RV32-NEXT: or a0, a0, a2 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: sw t1, 0(a4) +; RV32-NEXT: sw a5, 4(a4) ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.i64: @@ -2343,71 +1798,26 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB26_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beqz a3, .LBB26_5 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a5, a3, a0 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: mulhu a7, a0, a2 ; RV32ZBA-NEXT: snez t0, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: snez a6, a1 -; RV32ZBA-NEXT: mulhu a1, a1, a2 ; RV32ZBA-NEXT: mulhu a3, a3, a0 -; RV32ZBA-NEXT: and a6, a6, t0 -; RV32ZBA-NEXT: snez t0, a1 -; RV32ZBA-NEXT: snez a3, a3 -; RV32ZBA-NEXT: add a1, a7, a5 -; RV32ZBA-NEXT: or a5, a6, t0 
-; RV32ZBA-NEXT: sltu a6, a1, a7 -; RV32ZBA-NEXT: or a3, a5, a3 -; RV32ZBA-NEXT: or a6, a3, a6 -; RV32ZBA-NEXT: j .LBB26_7 -; RV32ZBA-NEXT: .LBB26_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beqz a3, .LBB26_6 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: mul a7, a1, a2 -; RV32ZBA-NEXT: mul a5, a0, a2 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: mulhu a2, a0, a3 -; RV32ZBA-NEXT: mul a1, a1, a3 -; RV32ZBA-NEXT: add a2, a2, a1 -; RV32ZBA-NEXT: mul a1, a0, a3 -; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: sltu a0, a1, a6 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: snez a6, a0 -; RV32ZBA-NEXT: j .LBB26_8 -; RV32ZBA-NEXT: .LBB26_5: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a6, a2, a0 -; RV32ZBA-NEXT: mul a7, a3, a0 -; RV32ZBA-NEXT: mul a5, a2, a0 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: mulhu a0, a2, a1 -; RV32ZBA-NEXT: mul a3, a3, a1 -; RV32ZBA-NEXT: add a0, a0, a3 -; RV32ZBA-NEXT: mul a1, a2, a1 -; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: sltu a2, a1, a6 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: snez a6, a0 -; RV32ZBA-NEXT: j .LBB26_8 -; RV32ZBA-NEXT: .LBB26_6: # %overflow.no -; RV32ZBA-NEXT: li a6, 0 -; RV32ZBA-NEXT: mulhu a5, a0, a2 -; RV32ZBA-NEXT: mul a3, a0, a3 -; RV32ZBA-NEXT: add a3, a5, a3 -; RV32ZBA-NEXT: mul a1, a1, a2 -; RV32ZBA-NEXT: add a1, a3, a1 -; RV32ZBA-NEXT: .LBB26_7: # %overflow.res -; RV32ZBA-NEXT: mul a5, a0, a2 -; RV32ZBA-NEXT: .LBB26_8: # %overflow.res -; RV32ZBA-NEXT: andi a0, a6, 1 -; RV32ZBA-NEXT: sw a5, 0(a4) -; RV32ZBA-NEXT: sw a1, 4(a4) +; RV32ZBA-NEXT: mul t1, a0, a2 +; RV32ZBA-NEXT: mulhu a0, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: and a1, a1, t0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: snez a2, a3 +; RV32ZBA-NEXT: add a5, a7, a5 +; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: sltu a1, a5, a7 +; RV32ZBA-NEXT: or a0, a0, a2 +; RV32ZBA-NEXT: or a0, a0, a1 +; RV32ZBA-NEXT: sw t1, 0(a4) +; RV32ZBA-NEXT: sw a5, 4(a4) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.i64: @@ -2420,71 +1830,26 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB26_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB26_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a5, a3, a0 ; RV32ZICOND-NEXT: mul a6, a1, a2 ; RV32ZICOND-NEXT: mulhu a7, a0, a2 ; RV32ZICOND-NEXT: snez t0, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: snez a6, a1 -; RV32ZICOND-NEXT: mulhu a1, a1, a2 ; RV32ZICOND-NEXT: mulhu a3, a3, a0 -; RV32ZICOND-NEXT: and a6, a6, t0 -; RV32ZICOND-NEXT: snez t0, a1 -; RV32ZICOND-NEXT: snez a3, a3 -; RV32ZICOND-NEXT: add a1, a7, a5 -; RV32ZICOND-NEXT: or a5, a6, t0 -; RV32ZICOND-NEXT: sltu a6, a1, a7 -; RV32ZICOND-NEXT: or a3, a5, a3 -; RV32ZICOND-NEXT: or a6, a3, a6 -; RV32ZICOND-NEXT: j .LBB26_7 -; RV32ZICOND-NEXT: .LBB26_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB26_6 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: mulhu a6, a0, a2 -; RV32ZICOND-NEXT: mul a7, a1, a2 -; RV32ZICOND-NEXT: mul a5, a0, a2 -; RV32ZICOND-NEXT: add a6, a6, a7 -; RV32ZICOND-NEXT: mulhu a2, a0, a3 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: add a2, a2, a1 -; RV32ZICOND-NEXT: mul a1, a0, a3 -; RV32ZICOND-NEXT: add a1, a6, a1 -; RV32ZICOND-NEXT: sltu a0, a1, a6 -; RV32ZICOND-NEXT: add a0, a2, a0 -; 
RV32ZICOND-NEXT: snez a6, a0 -; RV32ZICOND-NEXT: j .LBB26_8 -; RV32ZICOND-NEXT: .LBB26_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: mulhu a6, a2, a0 -; RV32ZICOND-NEXT: mul a7, a3, a0 -; RV32ZICOND-NEXT: mul a5, a2, a0 -; RV32ZICOND-NEXT: add a6, a6, a7 -; RV32ZICOND-NEXT: mulhu a0, a2, a1 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: add a0, a0, a3 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: add a1, a6, a1 -; RV32ZICOND-NEXT: sltu a2, a1, a6 -; RV32ZICOND-NEXT: add a0, a0, a2 -; RV32ZICOND-NEXT: snez a6, a0 -; RV32ZICOND-NEXT: j .LBB26_8 -; RV32ZICOND-NEXT: .LBB26_6: # %overflow.no -; RV32ZICOND-NEXT: li a6, 0 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: mul a3, a0, a3 -; RV32ZICOND-NEXT: add a3, a5, a3 -; RV32ZICOND-NEXT: mul a1, a1, a2 -; RV32ZICOND-NEXT: add a1, a3, a1 -; RV32ZICOND-NEXT: .LBB26_7: # %overflow.res -; RV32ZICOND-NEXT: mul a5, a0, a2 -; RV32ZICOND-NEXT: .LBB26_8: # %overflow.res -; RV32ZICOND-NEXT: andi a0, a6, 1 -; RV32ZICOND-NEXT: sw a5, 0(a4) -; RV32ZICOND-NEXT: sw a1, 4(a4) +; RV32ZICOND-NEXT: mul t1, a0, a2 +; RV32ZICOND-NEXT: mulhu a0, a1, a2 +; RV32ZICOND-NEXT: snez a1, a1 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: and a1, a1, t0 +; RV32ZICOND-NEXT: snez a0, a0 +; RV32ZICOND-NEXT: snez a2, a3 +; RV32ZICOND-NEXT: add a5, a7, a5 +; RV32ZICOND-NEXT: or a0, a1, a0 +; RV32ZICOND-NEXT: sltu a1, a5, a7 +; RV32ZICOND-NEXT: or a0, a0, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: sw t1, 0(a4) +; RV32ZICOND-NEXT: sw a5, 4(a4) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.i64: @@ -2505,30 +1870,18 @@ entry: define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV32-LABEL: umulo2.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB27_2 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: li a4, 13 -; RV32-NEXT: mul a3, a0, a4 -; RV32-NEXT: mulhu a0, a0, a4 -; RV32-NEXT: mulhu a5, a1, a4 -; RV32-NEXT: mul a1, a1, a4 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: sltu a0, a1, a0 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: snez a4, a0 -; RV32-NEXT: j .LBB27_3 -; RV32-NEXT: .LBB27_2: # %overflow.no.lhs -; RV32-NEXT: li a4, 0 +; RV32: # %bb.0: # %entry ; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a4, a1, a3 ; RV32-NEXT: mulhu a5, a0, a3 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, a5, a1 +; RV32-NEXT: mulhu a1, a1, a3 ; RV32-NEXT: mul a3, a0, a3 -; RV32-NEXT: .LBB27_3: # %overflow.res -; RV32-NEXT: andi a0, a4, 1 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: snez a0, a1 +; RV32-NEXT: sltu a1, a4, a5 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: sw a3, 0(a2) -; RV32-NEXT: sw a1, 4(a2) +; RV32-NEXT: sw a4, 4(a2) ; RV32-NEXT: ret ; ; RV64-LABEL: umulo2.i64: @@ -2542,34 +1895,20 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo2.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB27_2 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: sh1add a3, a0, a0 -; RV32ZBA-NEXT: li a5, 13 -; RV32ZBA-NEXT: sh1add a6, a1, a1 -; RV32ZBA-NEXT: sh2add a4, a3, a0 -; RV32ZBA-NEXT: mulhu a0, a0, a5 -; RV32ZBA-NEXT: mulhu a3, a1, a5 -; RV32ZBA-NEXT: sh2add a1, a6, a1 -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: sltu a0, a1, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: snez a3, a0 -; RV32ZBA-NEXT: j .LBB27_3 -; RV32ZBA-NEXT: .LBB27_2: # %overflow.no.lhs -; RV32ZBA-NEXT: li a3, 0 +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a3, 13 ; RV32ZBA-NEXT: sh1add a4, a1, a1 -; RV32ZBA-NEXT: sh2add a1, a4, a1 -; RV32ZBA-NEXT: li a4, 13 -; RV32ZBA-NEXT: mulhu 
a4, a0, a4 -; RV32ZBA-NEXT: add a1, a4, a1 -; RV32ZBA-NEXT: sh1add a4, a0, a0 -; RV32ZBA-NEXT: sh2add a4, a4, a0 -; RV32ZBA-NEXT: .LBB27_3: # %overflow.res -; RV32ZBA-NEXT: andi a0, a3, 1 -; RV32ZBA-NEXT: sw a4, 0(a2) -; RV32ZBA-NEXT: sw a1, 4(a2) +; RV32ZBA-NEXT: sh1add a5, a0, a0 +; RV32ZBA-NEXT: sh2add a4, a4, a1 +; RV32ZBA-NEXT: mulhu a1, a1, a3 +; RV32ZBA-NEXT: mulhu a3, a0, a3 +; RV32ZBA-NEXT: sh2add a5, a5, a0 +; RV32ZBA-NEXT: add a4, a3, a4 +; RV32ZBA-NEXT: snez a0, a1 +; RV32ZBA-NEXT: sltu a1, a4, a3 +; RV32ZBA-NEXT: or a0, a0, a1 +; RV32ZBA-NEXT: sw a5, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo2.i64: @@ -2584,30 +1923,18 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo2.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB27_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: li a4, 13 -; RV32ZICOND-NEXT: mul a3, a0, a4 -; RV32ZICOND-NEXT: mulhu a0, a0, a4 -; RV32ZICOND-NEXT: mulhu a5, a1, a4 -; RV32ZICOND-NEXT: mul a1, a1, a4 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: snez a4, a0 -; RV32ZICOND-NEXT: j .LBB27_3 -; RV32ZICOND-NEXT: .LBB27_2: # %overflow.no.lhs -; RV32ZICOND-NEXT: li a4, 0 +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: li a3, 13 +; RV32ZICOND-NEXT: mul a4, a1, a3 ; RV32ZICOND-NEXT: mulhu a5, a0, a3 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: add a1, a5, a1 +; RV32ZICOND-NEXT: mulhu a1, a1, a3 ; RV32ZICOND-NEXT: mul a3, a0, a3 -; RV32ZICOND-NEXT: .LBB27_3: # %overflow.res -; RV32ZICOND-NEXT: andi a0, a4, 1 +; RV32ZICOND-NEXT: add a4, a5, a4 +; RV32ZICOND-NEXT: snez a0, a1 +; RV32ZICOND-NEXT: sltu a1, a4, a5 +; RV32ZICOND-NEXT: or a0, a0, a1 ; RV32ZICOND-NEXT: sw a3, 0(a2) -; RV32ZICOND-NEXT: sw a1, 4(a2) +; RV32ZICOND-NEXT: sw a4, 4(a2) ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo2.i64: @@ -3891,13 +3218,7 @@ entry: define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.select.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a5, a0, 31 -; RV32-NEXT: srai a4, a2, 31 -; RV32-NEXT: beq a1, a5, .LBB46_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beq a3, a4, .LBB46_6 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a6, a1, a2 @@ -3925,119 +3246,11 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a5, a5, a4 ; RV32-NEXT: xor a4, a6, a4 ; RV32-NEXT: or a4, a4, a5 -; RV32-NEXT: j .LBB46_26 -; RV32-NEXT: .LBB46_3: # %overflow.no.lhs -; RV32-NEXT: beq a3, a4, .LBB46_8 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: bltz a1, .LBB46_9 -; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: bgez a1, .LBB46_10 -; RV32-NEXT: j .LBB46_11 -; RV32-NEXT: .LBB46_6: # %overflow.no.rhs.only -; RV32-NEXT: bltz a3, .LBB46_13 -; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: bgez a3, .LBB46_14 -; RV32-NEXT: j .LBB46_15 -; RV32-NEXT: .LBB46_8: # %overflow.no -; RV32-NEXT: j .LBB46_27 -; RV32-NEXT: .LBB46_9: -; RV32-NEXT: neg a4, a0 -; RV32-NEXT: snez a5, a0 -; RV32-NEXT: neg a6, a1 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a1, .LBB46_11 -; RV32-NEXT: .LBB46_10: # %overflow.no.lhs.only -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: .LBB46_11: # %overflow.no.lhs.only -; 
RV32-NEXT: bltz a3, .LBB46_17 -; RV32-NEXT: # %bb.12: # %overflow.no.lhs.only -; RV32-NEXT: mv a7, a2 -; RV32-NEXT: mv a6, a3 -; RV32-NEXT: j .LBB46_18 -; RV32-NEXT: .LBB46_13: -; RV32-NEXT: neg a4, a2 -; RV32-NEXT: snez a5, a2 -; RV32-NEXT: neg a6, a3 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a3, .LBB46_15 -; RV32-NEXT: .LBB46_14: # %overflow.no.rhs.only -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: .LBB46_15: # %overflow.no.rhs.only -; RV32-NEXT: bltz a1, .LBB46_21 -; RV32-NEXT: # %bb.16: # %overflow.no.rhs.only -; RV32-NEXT: mv a7, a0 -; RV32-NEXT: mv a6, a1 -; RV32-NEXT: j .LBB46_22 -; RV32-NEXT: .LBB46_17: -; RV32-NEXT: neg a7, a2 -; RV32-NEXT: snez a6, a2 -; RV32-NEXT: neg t0, a3 -; RV32-NEXT: sub a6, t0, a6 -; RV32-NEXT: .LBB46_18: # %overflow.no.lhs.only -; RV32-NEXT: slti t0, a1, 0 -; RV32-NEXT: slti t1, a3, 0 -; RV32-NEXT: bltz a3, .LBB46_20 -; RV32-NEXT: # %bb.19: # %overflow.no.lhs.only -; RV32-NEXT: mv a6, a3 -; RV32-NEXT: mv a7, a2 -; RV32-NEXT: .LBB46_20: # %overflow.no.lhs.only -; RV32-NEXT: mulhu t2, a4, a7 -; RV32-NEXT: mul t3, a5, a7 -; RV32-NEXT: mul a7, a4, a7 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: mulhu t4, a4, a6 -; RV32-NEXT: mul a4, a4, a6 -; RV32-NEXT: xor a6, t1, t0 -; RV32-NEXT: j .LBB46_25 -; RV32-NEXT: .LBB46_21: -; RV32-NEXT: neg a7, a0 -; RV32-NEXT: snez a6, a0 -; RV32-NEXT: neg t0, a1 -; RV32-NEXT: sub a6, t0, a6 -; RV32-NEXT: .LBB46_22: # %overflow.no.rhs.only -; RV32-NEXT: slti t0, a3, 0 -; RV32-NEXT: slti t1, a1, 0 -; RV32-NEXT: bltz a1, .LBB46_24 -; RV32-NEXT: # %bb.23: # %overflow.no.rhs.only -; RV32-NEXT: mv a6, a1 -; RV32-NEXT: mv a7, a0 -; RV32-NEXT: .LBB46_24: # %overflow.no.rhs.only -; RV32-NEXT: mulhu t2, a4, a7 -; RV32-NEXT: mul t3, a5, a7 -; RV32-NEXT: mul a7, a4, a7 -; RV32-NEXT: mul a5, a5, a6 -; RV32-NEXT: mulhu t4, a4, a6 -; RV32-NEXT: mul a4, a4, a6 -; RV32-NEXT: xor a6, t0, t1 -; RV32-NEXT: .LBB46_25: # %overflow.res -; RV32-NEXT: add t2, t2, t3 -; RV32-NEXT: add a5, t4, a5 -; RV32-NEXT: neg t0, a6 -; RV32-NEXT: add a4, t2, a4 -; RV32-NEXT: xor a7, a7, t0 -; RV32-NEXT: sltu t1, a4, t2 -; RV32-NEXT: add a7, a7, a6 -; RV32-NEXT: xor a4, a4, t0 -; RV32-NEXT: add a5, a5, t1 -; RV32-NEXT: sltu a6, a7, a6 -; RV32-NEXT: add a4, a4, a6 -; RV32-NEXT: sltu a4, a4, a6 -; RV32-NEXT: xor a5, a5, t0 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: .LBB46_26: # %overflow.res -; RV32-NEXT: snez a4, a4 -; RV32-NEXT: andi a4, a4, 1 -; RV32-NEXT: bnez a4, .LBB46_28 -; RV32-NEXT: .LBB46_27: # %overflow.res +; RV32-NEXT: bnez a4, .LBB46_2 +; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB46_28: # %overflow.res +; RV32-NEXT: .LBB46_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.select.i64: @@ -4052,13 +3265,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.select.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a5, a0, 31 -; RV32ZBA-NEXT: srai a4, a2, 31 -; RV32ZBA-NEXT: beq a1, a5, .LBB46_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB46_6 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a6, a1, a2 @@ -4086,119 +3293,11 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a5, a5, a4 ; RV32ZBA-NEXT: xor a4, a6, a4 ; RV32ZBA-NEXT: or a4, a4, a5 -; RV32ZBA-NEXT: j .LBB46_26 -; RV32ZBA-NEXT: .LBB46_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB46_8 -; RV32ZBA-NEXT: # 
%bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a1, .LBB46_9 -; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: bgez a1, .LBB46_10 -; RV32ZBA-NEXT: j .LBB46_11 -; RV32ZBA-NEXT: .LBB46_6: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a3, .LBB46_13 -; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: bgez a3, .LBB46_14 -; RV32ZBA-NEXT: j .LBB46_15 -; RV32ZBA-NEXT: .LBB46_8: # %overflow.no -; RV32ZBA-NEXT: j .LBB46_27 -; RV32ZBA-NEXT: .LBB46_9: -; RV32ZBA-NEXT: neg a4, a0 -; RV32ZBA-NEXT: snez a5, a0 -; RV32ZBA-NEXT: neg a6, a1 -; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a1, .LBB46_11 -; RV32ZBA-NEXT: .LBB46_10: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: .LBB46_11: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a3, .LBB46_17 -; RV32ZBA-NEXT: # %bb.12: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a7, a2 -; RV32ZBA-NEXT: mv a6, a3 -; RV32ZBA-NEXT: j .LBB46_18 -; RV32ZBA-NEXT: .LBB46_13: -; RV32ZBA-NEXT: neg a4, a2 -; RV32ZBA-NEXT: snez a5, a2 -; RV32ZBA-NEXT: neg a6, a3 -; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a3, .LBB46_15 -; RV32ZBA-NEXT: .LBB46_14: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: .LBB46_15: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a1, .LBB46_21 -; RV32ZBA-NEXT: # %bb.16: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a7, a0 -; RV32ZBA-NEXT: mv a6, a1 -; RV32ZBA-NEXT: j .LBB46_22 -; RV32ZBA-NEXT: .LBB46_17: -; RV32ZBA-NEXT: neg a7, a2 -; RV32ZBA-NEXT: snez a6, a2 -; RV32ZBA-NEXT: neg t0, a3 -; RV32ZBA-NEXT: sub a6, t0, a6 -; RV32ZBA-NEXT: .LBB46_18: # %overflow.no.lhs.only -; RV32ZBA-NEXT: slti t0, a1, 0 -; RV32ZBA-NEXT: slti t1, a3, 0 -; RV32ZBA-NEXT: bltz a3, .LBB46_20 -; RV32ZBA-NEXT: # %bb.19: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a6, a3 -; RV32ZBA-NEXT: mv a7, a2 -; RV32ZBA-NEXT: .LBB46_20: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu t2, a4, a7 -; RV32ZBA-NEXT: mul t3, a5, a7 -; RV32ZBA-NEXT: mul a7, a4, a7 -; RV32ZBA-NEXT: mul a5, a5, a6 -; RV32ZBA-NEXT: mulhu t4, a4, a6 -; RV32ZBA-NEXT: mul a4, a4, a6 -; RV32ZBA-NEXT: xor a6, t1, t0 -; RV32ZBA-NEXT: j .LBB46_25 -; RV32ZBA-NEXT: .LBB46_21: -; RV32ZBA-NEXT: neg a7, a0 -; RV32ZBA-NEXT: snez a6, a0 -; RV32ZBA-NEXT: neg t0, a1 -; RV32ZBA-NEXT: sub a6, t0, a6 -; RV32ZBA-NEXT: .LBB46_22: # %overflow.no.rhs.only -; RV32ZBA-NEXT: slti t0, a3, 0 -; RV32ZBA-NEXT: slti t1, a1, 0 -; RV32ZBA-NEXT: bltz a1, .LBB46_24 -; RV32ZBA-NEXT: # %bb.23: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a6, a1 -; RV32ZBA-NEXT: mv a7, a0 -; RV32ZBA-NEXT: .LBB46_24: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu t2, a4, a7 -; RV32ZBA-NEXT: mul t3, a5, a7 -; RV32ZBA-NEXT: mul a7, a4, a7 -; RV32ZBA-NEXT: mul a5, a5, a6 -; RV32ZBA-NEXT: mulhu t4, a4, a6 -; RV32ZBA-NEXT: mul a4, a4, a6 -; RV32ZBA-NEXT: xor a6, t0, t1 -; RV32ZBA-NEXT: .LBB46_25: # %overflow.res -; RV32ZBA-NEXT: add t2, t2, t3 -; RV32ZBA-NEXT: add a5, t4, a5 -; RV32ZBA-NEXT: neg t0, a6 -; RV32ZBA-NEXT: add a4, t2, a4 -; RV32ZBA-NEXT: xor a7, a7, t0 -; RV32ZBA-NEXT: sltu t1, a4, t2 -; RV32ZBA-NEXT: add a7, a7, a6 -; RV32ZBA-NEXT: xor a4, a4, t0 -; RV32ZBA-NEXT: add a5, a5, t1 -; RV32ZBA-NEXT: sltu a6, a7, a6 -; RV32ZBA-NEXT: add a4, a4, a6 -; RV32ZBA-NEXT: sltu a4, a4, a6 -; RV32ZBA-NEXT: xor a5, a5, t0 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: .LBB46_26: # %overflow.res -; RV32ZBA-NEXT: snez a4, a4 -; RV32ZBA-NEXT: 
andi a4, a4, 1 -; RV32ZBA-NEXT: bnez a4, .LBB46_28 -; RV32ZBA-NEXT: .LBB46_27: # %overflow.res +; RV32ZBA-NEXT: bnez a4, .LBB46_2 +; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB46_28: # %overflow.res +; RV32ZBA-NEXT: .LBB46_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.select.i64: @@ -4213,13 +3312,7 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.select.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: srai a5, a0, 31 -; RV32ZICOND-NEXT: srai a4, a2, 31 -; RV32ZICOND-NEXT: beq a1, a5, .LBB46_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB46_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a6, a1, a2 @@ -4242,99 +3335,11 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: srai a4, a4, 31 ; RV32ZICOND-NEXT: add a6, a7, a6 ; RV32ZICOND-NEXT: sltu a7, a6, a7 +; RV32ZICOND-NEXT: xor a6, a6, a4 ; RV32ZICOND-NEXT: add a5, t0, a5 ; RV32ZICOND-NEXT: add a5, a5, a7 -; RV32ZICOND-NEXT: xor a5, a5, a4 -; RV32ZICOND-NEXT: xor a4, a6, a4 -; RV32ZICOND-NEXT: or a4, a4, a5 -; RV32ZICOND-NEXT: j .LBB46_7 -; RV32ZICOND-NEXT: .LBB46_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB46_8 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: slti a4, a1, 0 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: snez a6, a0 -; RV32ZICOND-NEXT: neg a7, a1 -; RV32ZICOND-NEXT: slti t0, a3, 0 -; RV32ZICOND-NEXT: neg t1, a2 -; RV32ZICOND-NEXT: snez t2, a2 -; RV32ZICOND-NEXT: neg t3, a3 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez t4, a0, a4 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: czero.nez a7, a1, a4 -; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 -; RV32ZICOND-NEXT: sub t2, t3, t2 -; RV32ZICOND-NEXT: czero.nez t3, a2, t0 -; RV32ZICOND-NEXT: or a5, a5, t4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a5, a5, t4 -; RV32ZICOND-NEXT: czero.nez t4, a3, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a7 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, t0, a4 -; RV32ZICOND-NEXT: j .LBB46_6 -; RV32ZICOND-NEXT: .LBB46_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: slti a4, a3, 0 -; RV32ZICOND-NEXT: neg a5, a2 -; RV32ZICOND-NEXT: snez a6, a2 -; RV32ZICOND-NEXT: neg a7, a3 -; RV32ZICOND-NEXT: slti t0, a1, 0 -; RV32ZICOND-NEXT: neg t1, a0 -; RV32ZICOND-NEXT: snez t2, a0 -; RV32ZICOND-NEXT: neg t3, a1 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez t4, a2, a4 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: czero.nez a7, a3, a4 -; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 -; RV32ZICOND-NEXT: sub t2, t3, t2 -; RV32ZICOND-NEXT: czero.nez t3, a0, t0 -; RV32ZICOND-NEXT: or a5, a5, t4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a5, a5, t4 -; RV32ZICOND-NEXT: czero.nez t4, a1, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a7 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, a4, t0 -; RV32ZICOND-NEXT: .LBB46_6: # %overflow.res -; RV32ZICOND-NEXT: or t1, t1, t3 -; RV32ZICOND-NEXT: czero.eqz t2, t2, t0 -; RV32ZICOND-NEXT: or t2, t2, t4 -; RV32ZICOND-NEXT: czero.eqz t1, t1, t0 -; RV32ZICOND-NEXT: czero.eqz t0, t2, t0 -; RV32ZICOND-NEXT: or t1, t1, t3 -; RV32ZICOND-NEXT: or a6, a6, a7 -; RV32ZICOND-NEXT: or a7, t0, 
t4 -; RV32ZICOND-NEXT: mulhu t0, a5, t1 -; RV32ZICOND-NEXT: mul t2, a5, t1 -; RV32ZICOND-NEXT: mul t1, a6, t1 -; RV32ZICOND-NEXT: mul a6, a6, a7 -; RV32ZICOND-NEXT: mulhu t3, a5, a7 -; RV32ZICOND-NEXT: mul a5, a5, a7 -; RV32ZICOND-NEXT: neg a7, a4 -; RV32ZICOND-NEXT: xor t2, t2, a7 -; RV32ZICOND-NEXT: add t0, t0, t1 -; RV32ZICOND-NEXT: add a6, t3, a6 -; RV32ZICOND-NEXT: add t2, t2, a4 -; RV32ZICOND-NEXT: add a5, t0, a5 -; RV32ZICOND-NEXT: sltu a4, t2, a4 -; RV32ZICOND-NEXT: sltu t0, a5, t0 -; RV32ZICOND-NEXT: xor a5, a5, a7 -; RV32ZICOND-NEXT: add a6, a6, t0 -; RV32ZICOND-NEXT: add a5, a5, a4 -; RV32ZICOND-NEXT: sltu a4, a5, a4 -; RV32ZICOND-NEXT: xor a5, a6, a7 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: .LBB46_7: # %overflow.res -; RV32ZICOND-NEXT: snez a4, a4 -; RV32ZICOND-NEXT: j .LBB46_9 -; RV32ZICOND-NEXT: .LBB46_8: # %overflow.no -; RV32ZICOND-NEXT: li a4, 0 -; RV32ZICOND-NEXT: .LBB46_9: # %overflow.res -; RV32ZICOND-NEXT: andi a4, a4, 1 +; RV32ZICOND-NEXT: xor a4, a5, a4 +; RV32ZICOND-NEXT: or a4, a6, a4 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: czero.nez a3, a3, a4 @@ -4362,13 +3367,7 @@ entry: define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.not.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a5, a0, 31 -; RV32-NEXT: srai a4, a2, 31 -; RV32-NEXT: beq a1, a5, .LBB47_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beq a3, a4, .LBB47_6 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a2, a1, a2 @@ -4396,128 +3395,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a4, a5, a4 ; RV32-NEXT: or a0, a4, a0 -; RV32-NEXT: j .LBB47_25 -; RV32-NEXT: .LBB47_3: # %overflow.no.lhs -; RV32-NEXT: beq a3, a4, .LBB47_8 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: bltz a1, .LBB47_9 -; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: bgez a1, .LBB47_10 -; RV32-NEXT: j .LBB47_11 -; RV32-NEXT: .LBB47_6: # %overflow.no.rhs.only -; RV32-NEXT: bltz a3, .LBB47_13 -; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: bgez a3, .LBB47_14 -; RV32-NEXT: j .LBB47_15 -; RV32-NEXT: .LBB47_8: # %overflow.no -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret -; RV32-NEXT: .LBB47_9: -; RV32-NEXT: neg a4, a0 -; RV32-NEXT: snez a5, a0 -; RV32-NEXT: neg a6, a1 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a1, .LBB47_11 -; RV32-NEXT: .LBB47_10: # %overflow.no.lhs.only -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: .LBB47_11: # %overflow.no.lhs.only -; RV32-NEXT: bltz a3, .LBB47_17 -; RV32-NEXT: # %bb.12: # %overflow.no.lhs.only -; RV32-NEXT: mv a6, a2 -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: j .LBB47_18 -; RV32-NEXT: .LBB47_13: -; RV32-NEXT: neg a4, a2 -; RV32-NEXT: snez a5, a2 -; RV32-NEXT: neg a6, a3 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a3, .LBB47_15 -; RV32-NEXT: .LBB47_14: # %overflow.no.rhs.only -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: .LBB47_15: # %overflow.no.rhs.only -; RV32-NEXT: bltz a1, .LBB47_21 -; RV32-NEXT: # %bb.16: # %overflow.no.rhs.only -; RV32-NEXT: mv a6, a0 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: j .LBB47_22 -; RV32-NEXT: .LBB47_17: -; RV32-NEXT: neg a6, a2 -; RV32-NEXT: snez a0, a2 -; RV32-NEXT: neg a7, a3 -; RV32-NEXT: sub a0, a7, a0 -; RV32-NEXT: .LBB47_18: # %overflow.no.lhs.only -; RV32-NEXT: slti a1, 
a1, 0 -; RV32-NEXT: slti a7, a3, 0 -; RV32-NEXT: bltz a3, .LBB47_20 -; RV32-NEXT: # %bb.19: # %overflow.no.lhs.only -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a6, a2 -; RV32-NEXT: .LBB47_20: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a2, a4, a6 -; RV32-NEXT: mul a3, a5, a6 -; RV32-NEXT: mul a6, a4, a6 -; RV32-NEXT: mul a5, a5, a0 -; RV32-NEXT: mulhu t0, a4, a0 -; RV32-NEXT: mul a0, a4, a0 -; RV32-NEXT: xor a1, a7, a1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: neg a3, a1 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: xor a4, a6, a3 -; RV32-NEXT: sltu a2, a0, a2 -; RV32-NEXT: add a4, a4, a1 -; RV32-NEXT: xor a0, a0, a3 -; RV32-NEXT: add a2, a5, a2 -; RV32-NEXT: sltu a1, a4, a1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: xor a2, a2, a3 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: j .LBB47_25 -; RV32-NEXT: .LBB47_21: -; RV32-NEXT: neg a6, a0 -; RV32-NEXT: snez a2, a0 -; RV32-NEXT: neg a7, a1 -; RV32-NEXT: sub a2, a7, a2 -; RV32-NEXT: .LBB47_22: # %overflow.no.rhs.only -; RV32-NEXT: slti a3, a3, 0 -; RV32-NEXT: slti a7, a1, 0 -; RV32-NEXT: bltz a1, .LBB47_24 -; RV32-NEXT: # %bb.23: # %overflow.no.rhs.only -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: mv a6, a0 -; RV32-NEXT: .LBB47_24: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a0, a4, a6 -; RV32-NEXT: mul a1, a5, a6 -; RV32-NEXT: mul a6, a4, a6 -; RV32-NEXT: mul a5, a5, a2 -; RV32-NEXT: mulhu t0, a4, a2 -; RV32-NEXT: mul a2, a4, a2 -; RV32-NEXT: xor a3, a3, a7 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: neg a1, a3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: xor a4, a6, a1 -; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a4, a4, a3 -; RV32-NEXT: xor a2, a2, a1 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: sltu a3, a4, a3 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: sltu a2, a2, a3 -; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: .LBB47_25: # %overflow.res -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: xori a0, a0, 1 +; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.not.i64: @@ -4530,13 +3408,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.not.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a5, a0, 31 -; RV32ZBA-NEXT: srai a4, a2, 31 -; RV32ZBA-NEXT: beq a1, a5, .LBB47_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB47_6 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a2, a1, a2 @@ -4564,128 +3436,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a4, a5, a4 ; RV32ZBA-NEXT: or a0, a4, a0 -; RV32ZBA-NEXT: j .LBB47_25 -; RV32ZBA-NEXT: .LBB47_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB47_8 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a1, .LBB47_9 -; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: bgez a1, .LBB47_10 -; RV32ZBA-NEXT: j .LBB47_11 -; RV32ZBA-NEXT: .LBB47_6: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a3, .LBB47_13 -; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: bgez a3, .LBB47_14 -; RV32ZBA-NEXT: j .LBB47_15 -; RV32ZBA-NEXT: .LBB47_8: # %overflow.no -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB47_9: -; RV32ZBA-NEXT: neg a4, a0 -; RV32ZBA-NEXT: snez a5, a0 -; RV32ZBA-NEXT: neg a6, a1 -; 
RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a1, .LBB47_11 -; RV32ZBA-NEXT: .LBB47_10: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: .LBB47_11: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a3, .LBB47_17 -; RV32ZBA-NEXT: # %bb.12: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a6, a2 -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: j .LBB47_18 -; RV32ZBA-NEXT: .LBB47_13: -; RV32ZBA-NEXT: neg a4, a2 -; RV32ZBA-NEXT: snez a5, a2 -; RV32ZBA-NEXT: neg a6, a3 -; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a3, .LBB47_15 -; RV32ZBA-NEXT: .LBB47_14: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: .LBB47_15: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a1, .LBB47_21 -; RV32ZBA-NEXT: # %bb.16: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a6, a0 -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: j .LBB47_22 -; RV32ZBA-NEXT: .LBB47_17: -; RV32ZBA-NEXT: neg a6, a2 -; RV32ZBA-NEXT: snez a0, a2 -; RV32ZBA-NEXT: neg a7, a3 -; RV32ZBA-NEXT: sub a0, a7, a0 -; RV32ZBA-NEXT: .LBB47_18: # %overflow.no.lhs.only -; RV32ZBA-NEXT: slti a1, a1, 0 -; RV32ZBA-NEXT: slti a7, a3, 0 -; RV32ZBA-NEXT: bltz a3, .LBB47_20 -; RV32ZBA-NEXT: # %bb.19: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: mv a6, a2 -; RV32ZBA-NEXT: .LBB47_20: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a2, a4, a6 -; RV32ZBA-NEXT: mul a3, a5, a6 -; RV32ZBA-NEXT: mul a6, a4, a6 -; RV32ZBA-NEXT: mul a5, a5, a0 -; RV32ZBA-NEXT: mulhu t0, a4, a0 -; RV32ZBA-NEXT: mul a0, a4, a0 -; RV32ZBA-NEXT: xor a1, a7, a1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: add a5, t0, a5 -; RV32ZBA-NEXT: neg a3, a1 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: xor a4, a6, a3 -; RV32ZBA-NEXT: sltu a2, a0, a2 -; RV32ZBA-NEXT: add a4, a4, a1 -; RV32ZBA-NEXT: xor a0, a0, a3 -; RV32ZBA-NEXT: add a2, a5, a2 -; RV32ZBA-NEXT: sltu a1, a4, a1 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: sltu a0, a0, a1 -; RV32ZBA-NEXT: xor a2, a2, a3 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: j .LBB47_25 -; RV32ZBA-NEXT: .LBB47_21: -; RV32ZBA-NEXT: neg a6, a0 -; RV32ZBA-NEXT: snez a2, a0 -; RV32ZBA-NEXT: neg a7, a1 -; RV32ZBA-NEXT: sub a2, a7, a2 -; RV32ZBA-NEXT: .LBB47_22: # %overflow.no.rhs.only -; RV32ZBA-NEXT: slti a3, a3, 0 -; RV32ZBA-NEXT: slti a7, a1, 0 -; RV32ZBA-NEXT: bltz a1, .LBB47_24 -; RV32ZBA-NEXT: # %bb.23: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: mv a6, a0 -; RV32ZBA-NEXT: .LBB47_24: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a0, a4, a6 -; RV32ZBA-NEXT: mul a1, a5, a6 -; RV32ZBA-NEXT: mul a6, a4, a6 -; RV32ZBA-NEXT: mul a5, a5, a2 -; RV32ZBA-NEXT: mulhu t0, a4, a2 -; RV32ZBA-NEXT: mul a2, a4, a2 -; RV32ZBA-NEXT: xor a3, a3, a7 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a5, t0, a5 -; RV32ZBA-NEXT: neg a1, a3 -; RV32ZBA-NEXT: add a2, a0, a2 -; RV32ZBA-NEXT: xor a4, a6, a1 -; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a4, a4, a3 -; RV32ZBA-NEXT: xor a2, a2, a1 -; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: sltu a3, a4, a3 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: sltu a2, a2, a3 -; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: .LBB47_25: # %overflow.res -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: seqz a0, a0 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.not.i64: @@ -4698,13 +3449,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.not.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; 
RV32ZICOND-NEXT: srai a5, a0, 31 -; RV32ZICOND-NEXT: srai a4, a2, 31 -; RV32ZICOND-NEXT: beq a1, a5, .LBB47_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB47_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a2, a1, a2 @@ -4732,120 +3477,7 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: xor a0, a0, a4 ; RV32ZICOND-NEXT: xor a4, a5, a4 ; RV32ZICOND-NEXT: or a0, a4, a0 -; RV32ZICOND-NEXT: j .LBB47_6 -; RV32ZICOND-NEXT: .LBB47_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB47_7 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: slti a4, a1, 0 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: snez a6, a0 -; RV32ZICOND-NEXT: neg a7, a1 -; RV32ZICOND-NEXT: snez t0, a2 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: neg a7, a3 -; RV32ZICOND-NEXT: sub a7, a7, t0 -; RV32ZICOND-NEXT: slti t0, a3, 0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez a0, a0, a4 -; RV32ZICOND-NEXT: or a5, a5, a0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a0, a5, a0 -; RV32ZICOND-NEXT: neg a5, a2 -; RV32ZICOND-NEXT: czero.nez a1, a1, a4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.nez a2, a2, t0 -; RV32ZICOND-NEXT: czero.nez a3, a3, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, t0, a4 -; RV32ZICOND-NEXT: or a5, a5, a2 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: or a7, a7, a3 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: neg t0, a4 -; RV32ZICOND-NEXT: or a2, a5, a2 -; RV32ZICOND-NEXT: or a1, a6, a1 -; RV32ZICOND-NEXT: or a3, a7, a3 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: mul a6, a0, a2 -; RV32ZICOND-NEXT: mul a2, a1, a2 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: mulhu a7, a0, a3 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: xor a3, a6, t0 -; RV32ZICOND-NEXT: add a2, a5, a2 -; RV32ZICOND-NEXT: add a1, a7, a1 -; RV32ZICOND-NEXT: add a3, a3, a4 -; RV32ZICOND-NEXT: add a0, a2, a0 -; RV32ZICOND-NEXT: sltu a3, a3, a4 -; RV32ZICOND-NEXT: sltu a2, a0, a2 -; RV32ZICOND-NEXT: xor a0, a0, t0 -; RV32ZICOND-NEXT: add a1, a1, a2 -; RV32ZICOND-NEXT: add a0, a0, a3 -; RV32ZICOND-NEXT: sltu a0, a0, a3 -; RV32ZICOND-NEXT: xor a1, a1, t0 -; RV32ZICOND-NEXT: add a0, a1, a0 -; RV32ZICOND-NEXT: j .LBB47_6 -; RV32ZICOND-NEXT: .LBB47_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: slti a4, a3, 0 -; RV32ZICOND-NEXT: neg a5, a2 -; RV32ZICOND-NEXT: snez a6, a2 -; RV32ZICOND-NEXT: neg a7, a3 -; RV32ZICOND-NEXT: snez t0, a0 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: neg a7, a1 -; RV32ZICOND-NEXT: sub a7, a7, t0 -; RV32ZICOND-NEXT: slti t0, a1, 0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez a2, a2, a4 -; RV32ZICOND-NEXT: or a5, a5, a2 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a2, a5, a2 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: czero.nez a3, a3, a4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: czero.nez a1, a1, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a3 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, a4, t0 -; RV32ZICOND-NEXT: or a5, a5, a0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; 
RV32ZICOND-NEXT: or a7, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: neg t0, a4 -; RV32ZICOND-NEXT: or a0, a5, a0 -; RV32ZICOND-NEXT: or a3, a6, a3 -; RV32ZICOND-NEXT: or a1, a7, a1 -; RV32ZICOND-NEXT: mulhu a5, a2, a0 -; RV32ZICOND-NEXT: mul a6, a2, a0 -; RV32ZICOND-NEXT: mul a0, a3, a0 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: mulhu a7, a2, a1 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: xor a2, a6, t0 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: add a3, a7, a3 -; RV32ZICOND-NEXT: add a2, a2, a4 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a2, a2, a4 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a1, t0 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: add a1, a1, a2 -; RV32ZICOND-NEXT: sltu a1, a1, a2 -; RV32ZICOND-NEXT: xor a0, a0, t0 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: .LBB47_6: # %overflow.res -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: xori a0, a0, 1 -; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB47_7: # %overflow.no -; RV32ZICOND-NEXT: li a0, 1 +; RV32ZICOND-NEXT: seqz a0, a0 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo.not.i64: @@ -4985,11 +3617,7 @@ entry: define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.select.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB50_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beqz a3, .LBB50_5 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: snez a6, a3 @@ -5006,42 +3634,12 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: snez a6, a6 ; RV32-NEXT: or a5, a5, a6 ; RV32-NEXT: or a4, a5, a4 -; RV32-NEXT: andi a4, a4, 1 -; RV32-NEXT: beqz a4, .LBB50_7 -; RV32-NEXT: j .LBB50_8 -; RV32-NEXT: .LBB50_3: # %overflow.no.lhs -; RV32-NEXT: beqz a3, .LBB50_9 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a4, a0, a2 -; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: mulhu a6, a0, a3 -; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: mul a5, a1, a3 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mul a6, a0, a3 -; RV32-NEXT: j .LBB50_6 -; RV32-NEXT: .LBB50_5: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a4, a2, a0 -; RV32-NEXT: mul a5, a3, a0 -; RV32-NEXT: mulhu a6, a2, a1 -; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: mul a5, a3, a1 -; RV32-NEXT: add a5, a6, a5 -; RV32-NEXT: mul a6, a2, a1 -; RV32-NEXT: .LBB50_6: # %overflow.res -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sltu a4, a6, a4 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: snez a4, a4 -; RV32-NEXT: andi a4, a4, 1 -; RV32-NEXT: bnez a4, .LBB50_8 -; RV32-NEXT: .LBB50_7: # %overflow.res +; RV32-NEXT: bnez a4, .LBB50_2 +; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB50_8: # %overflow.res +; RV32-NEXT: .LBB50_2: # %entry ; RV32-NEXT: ret -; RV32-NEXT: .LBB50_9: # %overflow.no -; RV32-NEXT: j .LBB50_7 ; ; RV64-LABEL: umulo.select.i64: ; RV64: # %bb.0: # %entry @@ -5053,11 +3651,7 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.select.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB50_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beqz a3, .LBB50_5 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: snez a6, a3 @@ -5074,42 +3668,12 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; 
RV32ZBA-NEXT: snez a6, a6 ; RV32ZBA-NEXT: or a5, a5, a6 ; RV32ZBA-NEXT: or a4, a5, a4 -; RV32ZBA-NEXT: andi a4, a4, 1 -; RV32ZBA-NEXT: beqz a4, .LBB50_7 -; RV32ZBA-NEXT: j .LBB50_8 -; RV32ZBA-NEXT: .LBB50_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beqz a3, .LBB50_9 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a4, a0, a2 -; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: mulhu a6, a0, a3 -; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: mul a5, a1, a3 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mul a6, a0, a3 -; RV32ZBA-NEXT: j .LBB50_6 -; RV32ZBA-NEXT: .LBB50_5: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a4, a2, a0 -; RV32ZBA-NEXT: mul a5, a3, a0 -; RV32ZBA-NEXT: mulhu a6, a2, a1 -; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: mul a5, a3, a1 -; RV32ZBA-NEXT: add a5, a6, a5 -; RV32ZBA-NEXT: mul a6, a2, a1 -; RV32ZBA-NEXT: .LBB50_6: # %overflow.res -; RV32ZBA-NEXT: add a6, a4, a6 -; RV32ZBA-NEXT: sltu a4, a6, a4 -; RV32ZBA-NEXT: add a4, a5, a4 -; RV32ZBA-NEXT: snez a4, a4 -; RV32ZBA-NEXT: andi a4, a4, 1 -; RV32ZBA-NEXT: bnez a4, .LBB50_8 -; RV32ZBA-NEXT: .LBB50_7: # %overflow.res +; RV32ZBA-NEXT: bnez a4, .LBB50_2 +; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB50_8: # %overflow.res +; RV32ZBA-NEXT: .LBB50_2: # %entry ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB50_9: # %overflow.no -; RV32ZBA-NEXT: j .LBB50_7 ; ; RV64ZBA-LABEL: umulo.select.i64: ; RV64ZBA: # %bb.0: # %entry @@ -5121,11 +3685,7 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.select.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB50_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB50_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: snez a6, a3 @@ -5142,36 +3702,6 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: snez a6, a6 ; RV32ZICOND-NEXT: or a5, a5, a6 ; RV32ZICOND-NEXT: or a4, a5, a4 -; RV32ZICOND-NEXT: j .LBB50_8 -; RV32ZICOND-NEXT: .LBB50_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB50_7 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: mulhu a4, a0, a2 -; RV32ZICOND-NEXT: mul a5, a1, a2 -; RV32ZICOND-NEXT: mulhu a6, a0, a3 -; RV32ZICOND-NEXT: add a4, a4, a5 -; RV32ZICOND-NEXT: mul a5, a1, a3 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mul a6, a0, a3 -; RV32ZICOND-NEXT: j .LBB50_6 -; RV32ZICOND-NEXT: .LBB50_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: mulhu a4, a2, a0 -; RV32ZICOND-NEXT: mul a5, a3, a0 -; RV32ZICOND-NEXT: mulhu a6, a2, a1 -; RV32ZICOND-NEXT: add a4, a4, a5 -; RV32ZICOND-NEXT: mul a5, a3, a1 -; RV32ZICOND-NEXT: add a5, a6, a5 -; RV32ZICOND-NEXT: mul a6, a2, a1 -; RV32ZICOND-NEXT: .LBB50_6: # %overflow.res -; RV32ZICOND-NEXT: add a6, a4, a6 -; RV32ZICOND-NEXT: sltu a4, a6, a4 -; RV32ZICOND-NEXT: add a4, a5, a4 -; RV32ZICOND-NEXT: snez a4, a4 -; RV32ZICOND-NEXT: j .LBB50_8 -; RV32ZICOND-NEXT: .LBB50_7: # %overflow.no -; RV32ZICOND-NEXT: li a4, 0 -; RV32ZICOND-NEXT: .LBB50_8: # %overflow.res -; RV32ZICOND-NEXT: andi a4, a4, 1 ; RV32ZICOND-NEXT: czero.nez a2, a2, a4 ; RV32ZICOND-NEXT: czero.eqz a0, a0, a4 ; RV32ZICOND-NEXT: czero.nez a3, a3, a4 @@ -5196,11 +3726,7 @@ entry: define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.not.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB51_3 -; RV32-NEXT: # 
%bb.1: # %overflow.lhs -; RV32-NEXT: beqz a3, .LBB51_5 -; RV32-NEXT: # %bb.2: # %overflow +; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhu a6, a0, a2 @@ -5219,38 +3745,6 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret -; RV32-NEXT: .LBB51_3: # %overflow.no.lhs -; RV32-NEXT: beqz a3, .LBB51_7 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a4, a0, a2 -; RV32-NEXT: mul a2, a1, a2 -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: mulhu a4, a0, a3 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, a4, a1 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: sltu a0, a0, a2 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: j .LBB51_6 -; RV32-NEXT: .LBB51_5: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a4, a2, a0 -; RV32-NEXT: mul a0, a3, a0 -; RV32-NEXT: add a0, a4, a0 -; RV32-NEXT: mulhu a4, a2, a1 -; RV32-NEXT: mul a3, a3, a1 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: mul a1, a2, a1 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: sltu a0, a1, a0 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: .LBB51_6: # %overflow.no.rhs.only -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: xori a0, a0, 1 -; RV32-NEXT: ret -; RV32-NEXT: .LBB51_7: # %overflow.no -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret ; ; RV64-LABEL: umulo.not.i64: ; RV64: # %bb.0: # %entry @@ -5259,11 +3753,7 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.not.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB51_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beqz a3, .LBB51_5 -; RV32ZBA-NEXT: # %bb.2: # %overflow +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhu a6, a0, a2 @@ -5282,38 +3772,6 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: or a0, a0, a2 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB51_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beqz a3, .LBB51_7 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a4, a0, a2 -; RV32ZBA-NEXT: mul a2, a1, a2 -; RV32ZBA-NEXT: add a2, a4, a2 -; RV32ZBA-NEXT: mulhu a4, a0, a3 -; RV32ZBA-NEXT: mul a1, a1, a3 -; RV32ZBA-NEXT: add a1, a4, a1 -; RV32ZBA-NEXT: mul a0, a0, a3 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: sltu a0, a0, a2 -; RV32ZBA-NEXT: add a0, a1, a0 -; RV32ZBA-NEXT: j .LBB51_6 -; RV32ZBA-NEXT: .LBB51_5: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a4, a2, a0 -; RV32ZBA-NEXT: mul a0, a3, a0 -; RV32ZBA-NEXT: add a0, a4, a0 -; RV32ZBA-NEXT: mulhu a4, a2, a1 -; RV32ZBA-NEXT: mul a3, a3, a1 -; RV32ZBA-NEXT: add a3, a4, a3 -; RV32ZBA-NEXT: mul a1, a2, a1 -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: sltu a0, a1, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: .LBB51_6: # %overflow.no.rhs.only -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: xori a0, a0, 1 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB51_7: # %overflow.no -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.not.i64: ; RV64ZBA: # %bb.0: # %entry @@ -5322,11 +3780,7 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.not.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB51_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB51_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; 
RV32ZICOND-NEXT: mulhu a6, a0, a2 @@ -5345,38 +3799,6 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: or a0, a0, a2 ; RV32ZICOND-NEXT: xori a0, a0, 1 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB51_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB51_7 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: mulhu a4, a0, a2 -; RV32ZICOND-NEXT: mul a2, a1, a2 -; RV32ZICOND-NEXT: add a2, a4, a2 -; RV32ZICOND-NEXT: mulhu a4, a0, a3 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: add a1, a4, a1 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: add a0, a2, a0 -; RV32ZICOND-NEXT: sltu a0, a0, a2 -; RV32ZICOND-NEXT: add a0, a1, a0 -; RV32ZICOND-NEXT: j .LBB51_6 -; RV32ZICOND-NEXT: .LBB51_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: mulhu a4, a2, a0 -; RV32ZICOND-NEXT: mul a0, a3, a0 -; RV32ZICOND-NEXT: add a0, a4, a0 -; RV32ZICOND-NEXT: mulhu a4, a2, a1 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: add a3, a4, a3 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: .LBB51_6: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: xori a0, a0, 1 -; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB51_7: # %overflow.no -; RV32ZICOND-NEXT: li a0, 1 -; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.not.i64: ; RV64ZICOND: # %bb.0: # %entry @@ -6234,13 +4656,7 @@ continue: define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: smulo.br.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a5, a0, 31 -; RV32-NEXT: srai a4, a2, 31 -; RV32-NEXT: beq a1, a5, .LBB61_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beq a3, a4, .LBB61_6 -; RV32-NEXT: # %bb.2: # %overflow1 +; RV32: # %bb.0: # %entry ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhsu a2, a1, a2 @@ -6268,133 +4684,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a4, a5, a4 ; RV32-NEXT: or a0, a4, a0 -; RV32-NEXT: j .LBB61_26 -; RV32-NEXT: .LBB61_3: # %overflow.no.lhs -; RV32-NEXT: beq a3, a4, .LBB61_8 -; RV32-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32-NEXT: bltz a1, .LBB61_10 -; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: bgez a1, .LBB61_11 -; RV32-NEXT: j .LBB61_12 -; RV32-NEXT: .LBB61_6: # %overflow.no.rhs.only -; RV32-NEXT: bltz a3, .LBB61_14 -; RV32-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: bgez a3, .LBB61_15 -; RV32-NEXT: j .LBB61_16 -; RV32-NEXT: .LBB61_8: # %overflow.no -; RV32-NEXT: .LBB61_9: # %continue -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret -; RV32-NEXT: .LBB61_10: -; RV32-NEXT: neg a4, a0 -; RV32-NEXT: snez a5, a0 -; RV32-NEXT: neg a6, a1 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a1, .LBB61_12 -; RV32-NEXT: .LBB61_11: # %overflow.no.lhs.only -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: .LBB61_12: # %overflow.no.lhs.only -; RV32-NEXT: bltz a3, .LBB61_18 -; RV32-NEXT: # %bb.13: # %overflow.no.lhs.only -; RV32-NEXT: mv a6, a2 -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: j .LBB61_19 -; RV32-NEXT: .LBB61_14: -; RV32-NEXT: neg a4, a2 -; RV32-NEXT: snez a5, a2 -; RV32-NEXT: neg a6, a3 -; RV32-NEXT: sub a5, a6, a5 -; RV32-NEXT: bltz a3, .LBB61_16 -; RV32-NEXT: .LBB61_15: # %overflow.no.rhs.only -; RV32-NEXT: mv a5, a3 -; RV32-NEXT: mv a4, a2 -; RV32-NEXT: .LBB61_16: # %overflow.no.rhs.only -; RV32-NEXT: bltz a1, 
.LBB61_22 -; RV32-NEXT: # %bb.17: # %overflow.no.rhs.only -; RV32-NEXT: mv a6, a0 -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: j .LBB61_23 -; RV32-NEXT: .LBB61_18: -; RV32-NEXT: neg a6, a2 -; RV32-NEXT: snez a0, a2 -; RV32-NEXT: neg a7, a3 -; RV32-NEXT: sub a0, a7, a0 -; RV32-NEXT: .LBB61_19: # %overflow.no.lhs.only -; RV32-NEXT: slti a1, a1, 0 -; RV32-NEXT: slti a7, a3, 0 -; RV32-NEXT: bltz a3, .LBB61_21 -; RV32-NEXT: # %bb.20: # %overflow.no.lhs.only -; RV32-NEXT: mv a0, a3 -; RV32-NEXT: mv a6, a2 -; RV32-NEXT: .LBB61_21: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a2, a4, a6 -; RV32-NEXT: mul a3, a5, a6 -; RV32-NEXT: mul a6, a4, a6 -; RV32-NEXT: mul a5, a5, a0 -; RV32-NEXT: mulhu t0, a4, a0 -; RV32-NEXT: mul a0, a4, a0 -; RV32-NEXT: xor a1, a7, a1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: neg a3, a1 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: xor a4, a6, a3 -; RV32-NEXT: sltu a2, a0, a2 -; RV32-NEXT: add a4, a4, a1 -; RV32-NEXT: xor a0, a0, a3 -; RV32-NEXT: add a2, a5, a2 -; RV32-NEXT: sltu a1, a4, a1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: xor a2, a2, a3 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: j .LBB61_26 -; RV32-NEXT: .LBB61_22: -; RV32-NEXT: neg a6, a0 -; RV32-NEXT: snez a2, a0 -; RV32-NEXT: neg a7, a1 -; RV32-NEXT: sub a2, a7, a2 -; RV32-NEXT: .LBB61_23: # %overflow.no.rhs.only -; RV32-NEXT: slti a3, a3, 0 -; RV32-NEXT: slti a7, a1, 0 -; RV32-NEXT: bltz a1, .LBB61_25 -; RV32-NEXT: # %bb.24: # %overflow.no.rhs.only -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: mv a6, a0 -; RV32-NEXT: .LBB61_25: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a0, a4, a6 -; RV32-NEXT: mul a1, a5, a6 -; RV32-NEXT: mul a6, a4, a6 -; RV32-NEXT: mul a5, a5, a2 -; RV32-NEXT: mulhu t0, a4, a2 -; RV32-NEXT: mul a2, a4, a2 -; RV32-NEXT: xor a3, a3, a7 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a5, t0, a5 -; RV32-NEXT: neg a1, a3 -; RV32-NEXT: add a2, a0, a2 -; RV32-NEXT: xor a4, a6, a1 -; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a4, a4, a3 -; RV32-NEXT: xor a2, a2, a1 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: sltu a3, a4, a3 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: sltu a2, a2, a3 -; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: .LBB61_26: # %overflow.res -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: beqz a0, .LBB61_9 -; RV32-NEXT: # %bb.27: # %overflow +; RV32-NEXT: beqz a0, .LBB61_2 +; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret +; RV32-NEXT: .LBB61_2: # %continue +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret ; ; RV64-LABEL: smulo.br.i64: ; RV64: # %bb.0: # %entry @@ -6410,13 +4706,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.br.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a5, a0, 31 -; RV32ZBA-NEXT: srai a4, a2, 31 -; RV32ZBA-NEXT: beq a1, a5, .LBB61_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB61_6 -; RV32ZBA-NEXT: # %bb.2: # %overflow1 +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhsu a2, a1, a2 @@ -6444,133 +4734,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a4, a5, a4 ; RV32ZBA-NEXT: or a0, a4, a0 -; RV32ZBA-NEXT: j .LBB61_26 -; RV32ZBA-NEXT: .LBB61_3: # %overflow.no.lhs -; RV32ZBA-NEXT: beq a3, a4, .LBB61_8 -; RV32ZBA-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a1, .LBB61_10 -; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only -; 
RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: bgez a1, .LBB61_11 -; RV32ZBA-NEXT: j .LBB61_12 -; RV32ZBA-NEXT: .LBB61_6: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a3, .LBB61_14 -; RV32ZBA-NEXT: # %bb.7: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: bgez a3, .LBB61_15 -; RV32ZBA-NEXT: j .LBB61_16 -; RV32ZBA-NEXT: .LBB61_8: # %overflow.no -; RV32ZBA-NEXT: .LBB61_9: # %continue -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB61_10: -; RV32ZBA-NEXT: neg a4, a0 -; RV32ZBA-NEXT: snez a5, a0 -; RV32ZBA-NEXT: neg a6, a1 -; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a1, .LBB61_12 -; RV32ZBA-NEXT: .LBB61_11: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: .LBB61_12: # %overflow.no.lhs.only -; RV32ZBA-NEXT: bltz a3, .LBB61_18 -; RV32ZBA-NEXT: # %bb.13: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a6, a2 -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: j .LBB61_19 -; RV32ZBA-NEXT: .LBB61_14: -; RV32ZBA-NEXT: neg a4, a2 -; RV32ZBA-NEXT: snez a5, a2 -; RV32ZBA-NEXT: neg a6, a3 -; RV32ZBA-NEXT: sub a5, a6, a5 -; RV32ZBA-NEXT: bltz a3, .LBB61_16 -; RV32ZBA-NEXT: .LBB61_15: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a5, a3 -; RV32ZBA-NEXT: mv a4, a2 -; RV32ZBA-NEXT: .LBB61_16: # %overflow.no.rhs.only -; RV32ZBA-NEXT: bltz a1, .LBB61_22 -; RV32ZBA-NEXT: # %bb.17: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a6, a0 -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: j .LBB61_23 -; RV32ZBA-NEXT: .LBB61_18: -; RV32ZBA-NEXT: neg a6, a2 -; RV32ZBA-NEXT: snez a0, a2 -; RV32ZBA-NEXT: neg a7, a3 -; RV32ZBA-NEXT: sub a0, a7, a0 -; RV32ZBA-NEXT: .LBB61_19: # %overflow.no.lhs.only -; RV32ZBA-NEXT: slti a1, a1, 0 -; RV32ZBA-NEXT: slti a7, a3, 0 -; RV32ZBA-NEXT: bltz a3, .LBB61_21 -; RV32ZBA-NEXT: # %bb.20: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mv a0, a3 -; RV32ZBA-NEXT: mv a6, a2 -; RV32ZBA-NEXT: .LBB61_21: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a2, a4, a6 -; RV32ZBA-NEXT: mul a3, a5, a6 -; RV32ZBA-NEXT: mul a6, a4, a6 -; RV32ZBA-NEXT: mul a5, a5, a0 -; RV32ZBA-NEXT: mulhu t0, a4, a0 -; RV32ZBA-NEXT: mul a0, a4, a0 -; RV32ZBA-NEXT: xor a1, a7, a1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: add a5, t0, a5 -; RV32ZBA-NEXT: neg a3, a1 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: xor a4, a6, a3 -; RV32ZBA-NEXT: sltu a2, a0, a2 -; RV32ZBA-NEXT: add a4, a4, a1 -; RV32ZBA-NEXT: xor a0, a0, a3 -; RV32ZBA-NEXT: add a2, a5, a2 -; RV32ZBA-NEXT: sltu a1, a4, a1 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: sltu a0, a0, a1 -; RV32ZBA-NEXT: xor a2, a2, a3 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: j .LBB61_26 -; RV32ZBA-NEXT: .LBB61_22: -; RV32ZBA-NEXT: neg a6, a0 -; RV32ZBA-NEXT: snez a2, a0 -; RV32ZBA-NEXT: neg a7, a1 -; RV32ZBA-NEXT: sub a2, a7, a2 -; RV32ZBA-NEXT: .LBB61_23: # %overflow.no.rhs.only -; RV32ZBA-NEXT: slti a3, a3, 0 -; RV32ZBA-NEXT: slti a7, a1, 0 -; RV32ZBA-NEXT: bltz a1, .LBB61_25 -; RV32ZBA-NEXT: # %bb.24: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mv a2, a1 -; RV32ZBA-NEXT: mv a6, a0 -; RV32ZBA-NEXT: .LBB61_25: # %overflow.no.rhs.only -; RV32ZBA-NEXT: mulhu a0, a4, a6 -; RV32ZBA-NEXT: mul a1, a5, a6 -; RV32ZBA-NEXT: mul a6, a4, a6 -; RV32ZBA-NEXT: mul a5, a5, a2 -; RV32ZBA-NEXT: mulhu t0, a4, a2 -; RV32ZBA-NEXT: mul a2, a4, a2 -; RV32ZBA-NEXT: xor a3, a3, a7 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a5, t0, a5 -; RV32ZBA-NEXT: neg a1, a3 -; RV32ZBA-NEXT: add a2, a0, a2 -; RV32ZBA-NEXT: xor a4, a6, a1 -; RV32ZBA-NEXT: sltu a0, a2, a0 -; 
RV32ZBA-NEXT: add a4, a4, a3 -; RV32ZBA-NEXT: xor a2, a2, a1 -; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: sltu a3, a4, a3 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: sltu a2, a2, a3 -; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: .LBB61_26: # %overflow.res -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: andi a0, a0, 1 -; RV32ZBA-NEXT: beqz a0, .LBB61_9 -; RV32ZBA-NEXT: # %bb.27: # %overflow +; RV32ZBA-NEXT: beqz a0, .LBB61_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB61_2: # %continue +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -6586,13 +4756,7 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo.br.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: srai a5, a0, 31 -; RV32ZICOND-NEXT: srai a4, a2, 31 -; RV32ZICOND-NEXT: beq a1, a5, .LBB61_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB61_5 -; RV32ZICOND-NEXT: # %bb.2: # %overflow1 +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mulhu a4, a0, a2 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhsu a2, a1, a2 @@ -6620,123 +4784,11 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: xor a0, a0, a4 ; RV32ZICOND-NEXT: xor a4, a5, a4 ; RV32ZICOND-NEXT: or a0, a4, a0 -; RV32ZICOND-NEXT: j .LBB61_6 -; RV32ZICOND-NEXT: .LBB61_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: beq a3, a4, .LBB61_8 -; RV32ZICOND-NEXT: # %bb.4: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: slti a4, a1, 0 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: snez a6, a0 -; RV32ZICOND-NEXT: neg a7, a1 -; RV32ZICOND-NEXT: snez t0, a2 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: neg a7, a3 -; RV32ZICOND-NEXT: sub a7, a7, t0 -; RV32ZICOND-NEXT: slti t0, a3, 0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez a0, a0, a4 -; RV32ZICOND-NEXT: or a5, a5, a0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a0, a5, a0 -; RV32ZICOND-NEXT: neg a5, a2 -; RV32ZICOND-NEXT: czero.nez a1, a1, a4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.nez a2, a2, t0 -; RV32ZICOND-NEXT: czero.nez a3, a3, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a1 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, t0, a4 -; RV32ZICOND-NEXT: or a5, a5, a2 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: or a7, a7, a3 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: neg t0, a4 -; RV32ZICOND-NEXT: or a2, a5, a2 -; RV32ZICOND-NEXT: or a1, a6, a1 -; RV32ZICOND-NEXT: or a3, a7, a3 -; RV32ZICOND-NEXT: mulhu a5, a0, a2 -; RV32ZICOND-NEXT: mul a6, a0, a2 -; RV32ZICOND-NEXT: mul a2, a1, a2 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: mulhu a7, a0, a3 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: xor a3, a6, t0 -; RV32ZICOND-NEXT: add a2, a5, a2 -; RV32ZICOND-NEXT: add a1, a7, a1 -; RV32ZICOND-NEXT: add a3, a3, a4 -; RV32ZICOND-NEXT: add a0, a2, a0 -; RV32ZICOND-NEXT: sltu a3, a3, a4 -; RV32ZICOND-NEXT: sltu a2, a0, a2 -; RV32ZICOND-NEXT: xor a0, a0, t0 -; RV32ZICOND-NEXT: add a1, a1, a2 -; RV32ZICOND-NEXT: add a0, a0, a3 -; RV32ZICOND-NEXT: sltu a0, a0, a3 -; RV32ZICOND-NEXT: xor a1, a1, t0 -; RV32ZICOND-NEXT: add a0, a1, a0 -; RV32ZICOND-NEXT: j .LBB61_6 -; RV32ZICOND-NEXT: .LBB61_5: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: slti a4, a3, 
0 -; RV32ZICOND-NEXT: neg a5, a2 -; RV32ZICOND-NEXT: snez a6, a2 -; RV32ZICOND-NEXT: neg a7, a3 -; RV32ZICOND-NEXT: snez t0, a0 -; RV32ZICOND-NEXT: sub a6, a7, a6 -; RV32ZICOND-NEXT: neg a7, a1 -; RV32ZICOND-NEXT: sub a7, a7, t0 -; RV32ZICOND-NEXT: slti t0, a1, 0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: czero.nez a2, a2, a4 -; RV32ZICOND-NEXT: or a5, a5, a2 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a4 -; RV32ZICOND-NEXT: or a2, a5, a2 -; RV32ZICOND-NEXT: neg a5, a0 -; RV32ZICOND-NEXT: czero.nez a3, a3, a4 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.nez a0, a0, t0 -; RV32ZICOND-NEXT: czero.nez a1, a1, t0 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: or a6, a6, a3 -; RV32ZICOND-NEXT: czero.eqz a6, a6, a4 -; RV32ZICOND-NEXT: xor a4, a4, t0 -; RV32ZICOND-NEXT: or a5, a5, a0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: or a7, a7, a1 -; RV32ZICOND-NEXT: czero.eqz a5, a5, t0 -; RV32ZICOND-NEXT: czero.eqz a7, a7, t0 -; RV32ZICOND-NEXT: neg t0, a4 -; RV32ZICOND-NEXT: or a0, a5, a0 -; RV32ZICOND-NEXT: or a3, a6, a3 -; RV32ZICOND-NEXT: or a1, a7, a1 -; RV32ZICOND-NEXT: mulhu a5, a2, a0 -; RV32ZICOND-NEXT: mul a6, a2, a0 -; RV32ZICOND-NEXT: mul a0, a3, a0 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: mulhu a7, a2, a1 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: xor a2, a6, t0 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: add a3, a7, a3 -; RV32ZICOND-NEXT: add a2, a2, a4 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a2, a2, a4 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: xor a1, a1, t0 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: add a1, a1, a2 -; RV32ZICOND-NEXT: sltu a1, a1, a2 -; RV32ZICOND-NEXT: xor a0, a0, t0 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: .LBB61_6: # %overflow.res -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: andi a0, a0, 1 -; RV32ZICOND-NEXT: beqz a0, .LBB61_9 -; RV32ZICOND-NEXT: # %bb.7: # %overflow +; RV32ZICOND-NEXT: beqz a0, .LBB61_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB61_8: # %overflow.no -; RV32ZICOND-NEXT: .LBB61_9: # %continue +; RV32ZICOND-NEXT: .LBB61_2: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret ; @@ -6767,56 +4819,43 @@ continue: define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32-LABEL: smulo2.br.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: srai a2, a0, 31 -; RV32-NEXT: beq a1, a2, .LBB62_3 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: slti a2, a1, 0 -; RV32-NEXT: bltz a1, .LBB62_5 -; RV32-NEXT: # %bb.2: # %overflow.lhs -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: xori a3, a2, 1 -; RV32-NEXT: bgez a1, .LBB62_6 -; RV32-NEXT: j .LBB62_7 -; RV32-NEXT: .LBB62_3: # %overflow.no.lhs -; RV32-NEXT: .LBB62_4: # %continue -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret -; RV32-NEXT: .LBB62_5: -; RV32-NEXT: neg a4, a0 -; RV32-NEXT: snez a3, a0 -; RV32-NEXT: neg a5, a1 -; RV32-NEXT: sub a5, a5, a3 -; RV32-NEXT: xori a3, a2, 1 -; RV32-NEXT: bltz a1, .LBB62_7 -; RV32-NEXT: .LBB62_6: # %overflow.lhs -; RV32-NEXT: mv a5, a1 -; RV32-NEXT: mv a4, a0 -; RV32-NEXT: .LBB62_7: # %overflow.lhs -; RV32-NEXT: li a0, 13 -; RV32-NEXT: addi a2, a2, -1 -; RV32-NEXT: mul a1, a4, a0 -; RV32-NEXT: mulhu a4, a4, a0 -; RV32-NEXT: mulhu a6, a5, a0 -; RV32-NEXT: mul a0, a5, a0 -; RV32-NEXT: add a0, a4, a0 -; RV32-NEXT: xor a1, a1, a2 -; RV32-NEXT: sltu a4, a0, a4 -; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: xor a0, a0, a2 -; RV32-NEXT: 
add a4, a6, a4 -; RV32-NEXT: sltu a1, a1, a3 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: xor a2, a4, a2 +; RV32: # %bb.0: # %entry +; RV32-NEXT: li a2, -13 +; RV32-NEXT: neg a3, a0 +; RV32-NEXT: li a4, -1 +; RV32-NEXT: mulhu a5, a0, a2 +; RV32-NEXT: mul a6, a1, a2 +; RV32-NEXT: mulhsu a2, a1, a2 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a6, a5, a6 +; RV32-NEXT: sub a5, a5, a0 +; RV32-NEXT: mulhsu a0, a4, a0 +; RV32-NEXT: add a2, a2, a6 +; RV32-NEXT: sltu a3, a5, a3 +; RV32-NEXT: add a0, a0, a3 +; RV32-NEXT: srai a3, a2, 31 +; RV32-NEXT: srai a6, a0, 31 +; RV32-NEXT: add a3, a3, a6 +; RV32-NEXT: neg a6, a1 +; RV32-NEXT: mulh a4, a1, a4 +; RV32-NEXT: srai a5, a5, 31 ; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: beqz a0, .LBB62_4 -; RV32-NEXT: # %bb.8: # %overflow +; RV32-NEXT: sltu a2, a0, a2 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: sltu a1, a0, a6 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: xor a1, a1, a5 +; RV32-NEXT: xor a0, a0, a5 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: beqz a0, .LBB62_2 +; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret +; RV32-NEXT: .LBB62_2: # %continue +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret ; ; RV64-LABEL: smulo2.br.i64: ; RV64: # %bb.0: # %entry @@ -6833,58 +4872,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo2.br.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: srai a2, a0, 31 -; RV32ZBA-NEXT: beq a1, a2, .LBB62_3 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: slti a2, a1, 0 -; RV32ZBA-NEXT: bltz a1, .LBB62_5 -; RV32ZBA-NEXT: # %bb.2: # %overflow.lhs -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: xori a3, a2, 1 -; RV32ZBA-NEXT: bgez a1, .LBB62_6 -; RV32ZBA-NEXT: j .LBB62_7 -; RV32ZBA-NEXT: .LBB62_3: # %overflow.no.lhs -; RV32ZBA-NEXT: .LBB62_4: # %continue -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB62_5: -; RV32ZBA-NEXT: neg a4, a0 -; RV32ZBA-NEXT: snez a3, a0 -; RV32ZBA-NEXT: neg a5, a1 -; RV32ZBA-NEXT: sub a5, a5, a3 -; RV32ZBA-NEXT: xori a3, a2, 1 -; RV32ZBA-NEXT: bltz a1, .LBB62_7 -; RV32ZBA-NEXT: .LBB62_6: # %overflow.lhs -; RV32ZBA-NEXT: mv a5, a1 -; RV32ZBA-NEXT: mv a4, a0 -; RV32ZBA-NEXT: .LBB62_7: # %overflow.lhs -; RV32ZBA-NEXT: sh1add a0, a4, a4 -; RV32ZBA-NEXT: li a1, 13 -; RV32ZBA-NEXT: sh1add a6, a5, a5 -; RV32ZBA-NEXT: addi a2, a2, -1 -; RV32ZBA-NEXT: sh2add a0, a0, a4 -; RV32ZBA-NEXT: mulhu a4, a4, a1 -; RV32ZBA-NEXT: sh2add a6, a6, a5 -; RV32ZBA-NEXT: mulhu a1, a5, a1 -; RV32ZBA-NEXT: add a6, a4, a6 -; RV32ZBA-NEXT: xor a0, a0, a2 -; RV32ZBA-NEXT: sltu a4, a6, a4 +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: li a2, -13 +; RV32ZBA-NEXT: neg a3, a0 +; RV32ZBA-NEXT: li a4, -1 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: mul a6, a1, a2 +; RV32ZBA-NEXT: mulhsu a2, a1, a2 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: sltu a6, a5, a6 +; RV32ZBA-NEXT: sub a5, a5, a0 +; RV32ZBA-NEXT: mulhsu a0, a4, a0 +; RV32ZBA-NEXT: add a2, a2, a6 +; RV32ZBA-NEXT: sltu a3, a5, a3 ; RV32ZBA-NEXT: add a0, a0, a3 -; RV32ZBA-NEXT: xor a5, a6, a2 -; RV32ZBA-NEXT: add a1, a1, a4 -; RV32ZBA-NEXT: sltu a0, a0, a3 -; RV32ZBA-NEXT: add a5, a5, a0 -; RV32ZBA-NEXT: sltu a0, a5, a0 -; RV32ZBA-NEXT: xor a1, a1, a2 -; RV32ZBA-NEXT: add a0, a1, a0 -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: andi a0, a0, 1 -; RV32ZBA-NEXT: beqz a0, .LBB62_4 -; RV32ZBA-NEXT: # %bb.8: # %overflow +; 
RV32ZBA-NEXT: srai a3, a2, 31 +; RV32ZBA-NEXT: srai a6, a0, 31 +; RV32ZBA-NEXT: add a3, a3, a6 +; RV32ZBA-NEXT: neg a6, a1 +; RV32ZBA-NEXT: mulh a4, a1, a4 +; RV32ZBA-NEXT: srai a5, a5, 31 +; RV32ZBA-NEXT: add a0, a2, a0 +; RV32ZBA-NEXT: sltu a2, a0, a2 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: add a2, a3, a2 +; RV32ZBA-NEXT: sltu a1, a0, a6 +; RV32ZBA-NEXT: add a2, a4, a2 +; RV32ZBA-NEXT: add a1, a2, a1 +; RV32ZBA-NEXT: xor a1, a1, a5 +; RV32ZBA-NEXT: xor a0, a0, a5 +; RV32ZBA-NEXT: or a0, a0, a1 +; RV32ZBA-NEXT: beqz a0, .LBB62_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB62_2: # %continue +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo2.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -6901,54 +4925,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: smulo2.br.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: srai a2, a0, 31 -; RV32ZICOND-NEXT: beq a1, a2, .LBB62_3 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: slti a2, a1, 0 +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: li a2, -13 ; RV32ZICOND-NEXT: neg a3, a0 -; RV32ZICOND-NEXT: snez a4, a0 -; RV32ZICOND-NEXT: neg a5, a1 -; RV32ZICOND-NEXT: li a6, 13 -; RV32ZICOND-NEXT: sub a5, a5, a4 -; RV32ZICOND-NEXT: xori a4, a2, 1 -; RV32ZICOND-NEXT: czero.eqz a3, a3, a2 -; RV32ZICOND-NEXT: czero.nez a0, a0, a2 -; RV32ZICOND-NEXT: czero.nez a1, a1, a2 -; RV32ZICOND-NEXT: or a3, a3, a0 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a2 -; RV32ZICOND-NEXT: or a5, a5, a1 -; RV32ZICOND-NEXT: czero.eqz a3, a3, a2 -; RV32ZICOND-NEXT: czero.eqz a5, a5, a2 -; RV32ZICOND-NEXT: addi a2, a2, -1 -; RV32ZICOND-NEXT: or a0, a3, a0 -; RV32ZICOND-NEXT: or a1, a5, a1 -; RV32ZICOND-NEXT: mul a3, a0, a6 -; RV32ZICOND-NEXT: mulhu a0, a0, a6 -; RV32ZICOND-NEXT: mulhu a5, a1, a6 -; RV32ZICOND-NEXT: mul a1, a1, a6 -; RV32ZICOND-NEXT: xor a3, a3, a2 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: add a3, a3, a4 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: sltu a3, a3, a4 -; RV32ZICOND-NEXT: xor a1, a1, a2 -; RV32ZICOND-NEXT: add a0, a5, a0 -; RV32ZICOND-NEXT: add a1, a1, a3 -; RV32ZICOND-NEXT: sltu a1, a1, a3 -; RV32ZICOND-NEXT: xor a0, a0, a2 -; RV32ZICOND-NEXT: add a0, a0, a1 -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: andi a0, a0, 1 -; RV32ZICOND-NEXT: bnez a0, .LBB62_4 +; RV32ZICOND-NEXT: li a4, -1 +; RV32ZICOND-NEXT: mulhu a5, a0, a2 +; RV32ZICOND-NEXT: mul a6, a1, a2 +; RV32ZICOND-NEXT: mulhsu a2, a1, a2 +; RV32ZICOND-NEXT: add a5, a6, a5 +; RV32ZICOND-NEXT: sltu a6, a5, a6 +; RV32ZICOND-NEXT: sub a5, a5, a0 +; RV32ZICOND-NEXT: mulhsu a0, a4, a0 +; RV32ZICOND-NEXT: add a2, a2, a6 +; RV32ZICOND-NEXT: sltu a3, a5, a3 +; RV32ZICOND-NEXT: add a0, a0, a3 +; RV32ZICOND-NEXT: srai a3, a2, 31 +; RV32ZICOND-NEXT: srai a6, a0, 31 +; RV32ZICOND-NEXT: add a3, a3, a6 +; RV32ZICOND-NEXT: neg a6, a1 +; RV32ZICOND-NEXT: mulh a4, a1, a4 +; RV32ZICOND-NEXT: srai a5, a5, 31 +; RV32ZICOND-NEXT: add a0, a2, a0 +; RV32ZICOND-NEXT: sltu a2, a0, a2 +; RV32ZICOND-NEXT: sub a0, a0, a1 +; RV32ZICOND-NEXT: add a2, a3, a2 +; RV32ZICOND-NEXT: sltu a1, a0, a6 +; RV32ZICOND-NEXT: add a2, a4, a2 +; RV32ZICOND-NEXT: add a1, a2, a1 +; RV32ZICOND-NEXT: xor a1, a1, a5 +; RV32ZICOND-NEXT: xor a0, a0, a5 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: beqz a0, .LBB62_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow +; RV32ZICOND-NEXT: li a0, 0 +; RV32ZICOND-NEXT: ret ; RV32ZICOND-NEXT: .LBB62_2: # %continue ; 
RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB62_3: # %overflow.no.lhs -; RV32ZICOND-NEXT: j .LBB62_2 -; RV32ZICOND-NEXT: .LBB62_4: # %overflow -; RV32ZICOND-NEXT: li a0, 0 -; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: smulo2.br.i64: ; RV64ZICOND: # %bb.0: # %entry @@ -7066,11 +5079,7 @@ continue: define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: umulo.br.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB64_4 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: beqz a3, .LBB64_6 -; RV32-NEXT: # %bb.2: # %overflow1 +; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: mulhu a6, a0, a2 @@ -7087,45 +5096,13 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: sltu a2, a4, a6 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a0, a2 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: bnez a0, .LBB64_8 -; RV32-NEXT: .LBB64_3: # %continue -; RV32-NEXT: li a0, 1 -; RV32-NEXT: ret -; RV32-NEXT: .LBB64_4: # %overflow.no.lhs -; RV32-NEXT: beqz a3, .LBB64_9 -; RV32-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32-NEXT: mulhu a4, a0, a2 -; RV32-NEXT: mul a2, a1, a2 -; RV32-NEXT: add a2, a4, a2 -; RV32-NEXT: mulhu a4, a0, a3 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, a4, a1 -; RV32-NEXT: mul a0, a0, a3 -; RV32-NEXT: add a0, a2, a0 -; RV32-NEXT: sltu a0, a0, a2 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: j .LBB64_7 -; RV32-NEXT: .LBB64_6: # %overflow.no.rhs.only -; RV32-NEXT: mulhu a4, a2, a0 -; RV32-NEXT: mul a0, a3, a0 -; RV32-NEXT: add a0, a4, a0 -; RV32-NEXT: mulhu a4, a2, a1 -; RV32-NEXT: mul a3, a3, a1 -; RV32-NEXT: add a3, a4, a3 -; RV32-NEXT: mul a1, a2, a1 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: sltu a0, a1, a0 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: .LBB64_7: # %overflow.no.rhs.only -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: andi a0, a0, 1 -; RV32-NEXT: beqz a0, .LBB64_3 -; RV32-NEXT: .LBB64_8: # %overflow +; RV32-NEXT: beqz a0, .LBB64_2 +; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret -; RV32-NEXT: .LBB64_9: # %overflow.no -; RV32-NEXT: j .LBB64_3 +; RV32-NEXT: .LBB64_2: # %continue +; RV32-NEXT: li a0, 1 +; RV32-NEXT: ret ; ; RV64-LABEL: umulo.br.i64: ; RV64: # %bb.0: # %entry @@ -7139,11 +5116,7 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.br.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB64_4 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: beqz a3, .LBB64_6 -; RV32ZBA-NEXT: # %bb.2: # %overflow1 +; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: mulhu a6, a0, a2 @@ -7160,45 +5133,13 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: sltu a2, a4, a6 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: or a0, a0, a2 -; RV32ZBA-NEXT: andi a0, a0, 1 -; RV32ZBA-NEXT: bnez a0, .LBB64_8 -; RV32ZBA-NEXT: .LBB64_3: # %continue -; RV32ZBA-NEXT: li a0, 1 -; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB64_4: # %overflow.no.lhs -; RV32ZBA-NEXT: beqz a3, .LBB64_9 -; RV32ZBA-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32ZBA-NEXT: mulhu a4, a0, a2 -; RV32ZBA-NEXT: mul a2, a1, a2 -; RV32ZBA-NEXT: add a2, a4, a2 -; RV32ZBA-NEXT: mulhu a4, a0, a3 -; RV32ZBA-NEXT: mul a1, a1, a3 -; RV32ZBA-NEXT: add a1, a4, a1 -; RV32ZBA-NEXT: mul a0, a0, a3 -; RV32ZBA-NEXT: add a0, a2, a0 -; RV32ZBA-NEXT: sltu a0, a0, a2 -; RV32ZBA-NEXT: add a0, a1, a0 -; RV32ZBA-NEXT: j .LBB64_7 -; RV32ZBA-NEXT: .LBB64_6: # %overflow.no.rhs.only -; 
RV32ZBA-NEXT: mulhu a4, a2, a0 -; RV32ZBA-NEXT: mul a0, a3, a0 -; RV32ZBA-NEXT: add a0, a4, a0 -; RV32ZBA-NEXT: mulhu a4, a2, a1 -; RV32ZBA-NEXT: mul a3, a3, a1 -; RV32ZBA-NEXT: add a3, a4, a3 -; RV32ZBA-NEXT: mul a1, a2, a1 -; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: sltu a0, a1, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: .LBB64_7: # %overflow.no.rhs.only -; RV32ZBA-NEXT: snez a0, a0 -; RV32ZBA-NEXT: andi a0, a0, 1 -; RV32ZBA-NEXT: beqz a0, .LBB64_3 -; RV32ZBA-NEXT: .LBB64_8: # %overflow +; RV32ZBA-NEXT: beqz a0, .LBB64_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB64_9: # %overflow.no -; RV32ZBA-NEXT: j .LBB64_3 +; RV32ZBA-NEXT: .LBB64_2: # %continue +; RV32ZBA-NEXT: li a0, 1 +; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.br.i64: ; RV64ZBA: # %bb.0: # %entry @@ -7212,11 +5153,7 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo.br.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB64_4 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB64_6 -; RV32ZICOND-NEXT: # %bb.2: # %overflow1 +; RV32ZICOND: # %bb.0: # %entry ; RV32ZICOND-NEXT: mul a4, a3, a0 ; RV32ZICOND-NEXT: mul a5, a1, a2 ; RV32ZICOND-NEXT: mulhu a6, a0, a2 @@ -7233,45 +5170,13 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZICOND-NEXT: sltu a2, a4, a6 ; RV32ZICOND-NEXT: or a0, a1, a0 ; RV32ZICOND-NEXT: or a0, a0, a2 -; RV32ZICOND-NEXT: andi a0, a0, 1 -; RV32ZICOND-NEXT: bnez a0, .LBB64_8 -; RV32ZICOND-NEXT: .LBB64_3: # %continue -; RV32ZICOND-NEXT: li a0, 1 -; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB64_4: # %overflow.no.lhs -; RV32ZICOND-NEXT: beqz a3, .LBB64_9 -; RV32ZICOND-NEXT: # %bb.5: # %overflow.no.lhs.only -; RV32ZICOND-NEXT: mulhu a4, a0, a2 -; RV32ZICOND-NEXT: mul a2, a1, a2 -; RV32ZICOND-NEXT: add a2, a4, a2 -; RV32ZICOND-NEXT: mulhu a4, a0, a3 -; RV32ZICOND-NEXT: mul a1, a1, a3 -; RV32ZICOND-NEXT: add a1, a4, a1 -; RV32ZICOND-NEXT: mul a0, a0, a3 -; RV32ZICOND-NEXT: add a0, a2, a0 -; RV32ZICOND-NEXT: sltu a0, a0, a2 -; RV32ZICOND-NEXT: add a0, a1, a0 -; RV32ZICOND-NEXT: j .LBB64_7 -; RV32ZICOND-NEXT: .LBB64_6: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: mulhu a4, a2, a0 -; RV32ZICOND-NEXT: mul a0, a3, a0 -; RV32ZICOND-NEXT: add a0, a4, a0 -; RV32ZICOND-NEXT: mulhu a4, a2, a1 -; RV32ZICOND-NEXT: mul a3, a3, a1 -; RV32ZICOND-NEXT: add a3, a4, a3 -; RV32ZICOND-NEXT: mul a1, a2, a1 -; RV32ZICOND-NEXT: add a1, a0, a1 -; RV32ZICOND-NEXT: sltu a0, a1, a0 -; RV32ZICOND-NEXT: add a0, a3, a0 -; RV32ZICOND-NEXT: .LBB64_7: # %overflow.no.rhs.only -; RV32ZICOND-NEXT: snez a0, a0 -; RV32ZICOND-NEXT: andi a0, a0, 1 -; RV32ZICOND-NEXT: beqz a0, .LBB64_3 -; RV32ZICOND-NEXT: .LBB64_8: # %overflow +; RV32ZICOND-NEXT: beqz a0, .LBB64_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB64_9: # %overflow.no -; RV32ZICOND-NEXT: j .LBB64_3 +; RV32ZICOND-NEXT: .LBB64_2: # %continue +; RV32ZICOND-NEXT: li a0, 1 +; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: umulo.br.i64: ; RV64ZICOND: # %bb.0: # %entry @@ -7298,13 +5203,16 @@ continue: define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV32-LABEL: umulo2.br.i64: -; RV32: # %bb.0: # %overflow.entry -; RV32-NEXT: beqz a1, .LBB65_2 -; RV32-NEXT: # %bb.1: # %overflow.lhs -; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: .LBB65_2: # %overflow.res -; RV32-NEXT: andi a1, a1, 1 -; RV32-NEXT: beqz a1, .LBB65_4 +; RV32: # %bb.0: # %entry +; RV32-NEXT: 
add a2, a0, a0 +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: add a2, a1, a1 +; RV32-NEXT: add a2, a2, a0 +; RV32-NEXT: beq a2, a1, .LBB65_2 +; RV32-NEXT: # %bb.1: # %entry +; RV32-NEXT: sltu a0, a2, a1 +; RV32-NEXT: .LBB65_2: # %entry +; RV32-NEXT: beqz a0, .LBB65_4 ; RV32-NEXT: # %bb.3: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -7324,13 +5232,16 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo2.br.i64: -; RV32ZBA: # %bb.0: # %overflow.entry -; RV32ZBA-NEXT: beqz a1, .LBB65_2 -; RV32ZBA-NEXT: # %bb.1: # %overflow.lhs -; RV32ZBA-NEXT: srli a1, a1, 31 -; RV32ZBA-NEXT: .LBB65_2: # %overflow.res -; RV32ZBA-NEXT: andi a1, a1, 1 -; RV32ZBA-NEXT: beqz a1, .LBB65_4 +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a0 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a2, a1, a1 +; RV32ZBA-NEXT: add a2, a2, a0 +; RV32ZBA-NEXT: beq a2, a1, .LBB65_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a2, a1 +; RV32ZBA-NEXT: .LBB65_2: # %entry +; RV32ZBA-NEXT: beqz a0, .LBB65_4 ; RV32ZBA-NEXT: # %bb.3: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret @@ -7350,17 +5261,21 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV64ZBA-NEXT: ret ; ; RV32ZICOND-LABEL: umulo2.br.i64: -; RV32ZICOND: # %bb.0: # %overflow.entry -; RV32ZICOND-NEXT: beqz a1, .LBB65_2 -; RV32ZICOND-NEXT: # %bb.1: # %overflow.lhs -; RV32ZICOND-NEXT: srli a1, a1, 31 -; RV32ZICOND-NEXT: .LBB65_2: # %overflow.res -; RV32ZICOND-NEXT: andi a1, a1, 1 -; RV32ZICOND-NEXT: beqz a1, .LBB65_4 -; RV32ZICOND-NEXT: # %bb.3: # %overflow +; RV32ZICOND: # %bb.0: # %entry +; RV32ZICOND-NEXT: add a2, a0, a0 +; RV32ZICOND-NEXT: add a3, a1, a1 +; RV32ZICOND-NEXT: sltu a0, a2, a0 +; RV32ZICOND-NEXT: add a3, a3, a0 +; RV32ZICOND-NEXT: xor a2, a3, a1 +; RV32ZICOND-NEXT: sltu a1, a3, a1 +; RV32ZICOND-NEXT: czero.eqz a1, a1, a2 +; RV32ZICOND-NEXT: czero.nez a0, a0, a2 +; RV32ZICOND-NEXT: or a0, a0, a1 +; RV32ZICOND-NEXT: beqz a0, .LBB65_2 +; RV32ZICOND-NEXT: # %bb.1: # %overflow ; RV32ZICOND-NEXT: li a0, 0 ; RV32ZICOND-NEXT: ret -; RV32ZICOND-NEXT: .LBB65_4: # %continue +; RV32ZICOND-NEXT: .LBB65_2: # %continue ; RV32ZICOND-NEXT: li a0, 1 ; RV32ZICOND-NEXT: ret ; diff --git a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll index ff846adf7e138..1e5ab7922de08 100644 --- a/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll @@ -5,106 +5,93 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: -; SPARC: ! %bb.0: ! %overflow.entry +; SPARC: ! %bb.0: ! %start ; SPARC-NEXT: save %sp, -96, %sp -; SPARC-NEXT: ld [%fp+96], %g3 -; SPARC-NEXT: ld [%fp+92], %l0 -; SPARC-NEXT: sra %i2, 31, %g2 -; SPARC-NEXT: xor %i0, %g2, %g4 -; SPARC-NEXT: xor %i1, %g2, %g2 -; SPARC-NEXT: or %g2, %g4, %g2 -; SPARC-NEXT: cmp %g2, 0 -; SPARC-NEXT: sra %l0, 31, %g2 -; SPARC-NEXT: xor %i4, %g2, %g4 -; SPARC-NEXT: xor %i5, %g2, %g2 -; SPARC-NEXT: be .LBB0_4 -; SPARC-NEXT: or %g2, %g4, %g2 -; SPARC-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC-NEXT: cmp %g2, 0 -; SPARC-NEXT: be .LBB0_15 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.2: ! 
%overflow -; SPARC-NEXT: umul %i1, %g3, %l1 -; SPARC-NEXT: rd %y, %g2 -; SPARC-NEXT: umul %i0, %g3, %g4 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: addcc %g4, %g2, %g2 -; SPARC-NEXT: addxcc %l2, 0, %g4 -; SPARC-NEXT: umul %i1, %l0, %l2 +; SPARC-NEXT: ld [%fp+96], %l2 +; SPARC-NEXT: mov %i3, %g2 +; SPARC-NEXT: mov %i2, %g3 +; SPARC-NEXT: umul %i1, %l2, %l0 +; SPARC-NEXT: rd %y, %i2 +; SPARC-NEXT: ld [%fp+92], %l1 +; SPARC-NEXT: umul %i0, %l2, %i3 +; SPARC-NEXT: rd %y, %g4 +; SPARC-NEXT: addcc %i3, %i2, %i2 +; SPARC-NEXT: addxcc %g4, 0, %i3 +; SPARC-NEXT: umul %i1, %l1, %g4 +; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: addcc %g4, %i2, %l4 +; SPARC-NEXT: addxcc %l3, 0, %i2 +; SPARC-NEXT: addcc %i3, %i2, %i2 +; SPARC-NEXT: addxcc %g0, 0, %i3 +; SPARC-NEXT: umul %i0, %l1, %g4 ; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %l2, %g2, %l2 -; SPARC-NEXT: addxcc %l3, 0, %g2 -; SPARC-NEXT: addcc %g4, %g2, %g2 -; SPARC-NEXT: addxcc %g0, 0, %l3 -; SPARC-NEXT: umul %i0, %l0, %g4 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %g4, %g2, %g2 +; SPARC-NEXT: addcc %g4, %i2, %i2 ; SPARC-NEXT: sra %i0, 31, %g4 -; SPARC-NEXT: smul %l0, %g4, %l5 -; SPARC-NEXT: umul %g3, %g4, %l6 +; SPARC-NEXT: smul %l1, %g4, %l5 +; SPARC-NEXT: umul %l2, %g4, %l6 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addxcc %l4, %l3, %l3 -; SPARC-NEXT: add %l7, %l6, %l4 -; SPARC-NEXT: add %l4, %l5, %l4 -; SPARC-NEXT: addcc %g2, %l6, %l5 -; SPARC-NEXT: umul %i3, %g3, %g2 -; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: addxcc %l3, %l4, %l3 -; SPARC-NEXT: umul %i2, %g3, %g3 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %g3, %l6, %g3 -; SPARC-NEXT: addxcc %l4, 0, %l4 -; SPARC-NEXT: umul %i3, %l0, %l6 +; SPARC-NEXT: addxcc %l3, %i3, %l3 +; SPARC-NEXT: add %l7, %l6, %i3 +; SPARC-NEXT: add %i3, %l5, %l5 +; SPARC-NEXT: addcc %i2, %l6, %l6 +; SPARC-NEXT: umul %g2, %l2, %i3 +; SPARC-NEXT: rd %y, %i2 +; SPARC-NEXT: addxcc %l3, %l5, %l3 +; SPARC-NEXT: umul %g3, %l2, %l2 +; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: addcc %l2, %i2, %i2 +; SPARC-NEXT: addxcc %l5, 0, %l2 +; SPARC-NEXT: umul %g2, %l1, %l5 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l6, %g3, %g3 -; SPARC-NEXT: addxcc %l7, 0, %l6 -; SPARC-NEXT: addcc %l4, %l6, %l4 -; SPARC-NEXT: addxcc %g0, 0, %l6 -; SPARC-NEXT: umul %i2, %l0, %l0 +; SPARC-NEXT: addcc %l5, %i2, %i2 +; SPARC-NEXT: addxcc %l7, 0, %l5 +; SPARC-NEXT: addcc %l2, %l5, %l2 +; SPARC-NEXT: addxcc %g0, 0, %l5 +; SPARC-NEXT: umul %g3, %l1, %l1 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l0, %l4, %l0 -; SPARC-NEXT: addxcc %l7, %l6, %l4 -; SPARC-NEXT: addcc %l1, %l0, %l0 -; SPARC-NEXT: addxcc %l2, %l4, %l1 -; SPARC-NEXT: addxcc %l5, 0, %l2 -; SPARC-NEXT: umul %i2, %i5, %l4 -; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: addcc %l1, %l2, %l1 +; SPARC-NEXT: addxcc %l7, %l5, %l2 +; SPARC-NEXT: addcc %l0, %l1, %l0 +; SPARC-NEXT: addxcc %l4, %l2, %l1 +; SPARC-NEXT: addxcc %l6, 0, %l2 ; SPARC-NEXT: addxcc %l3, 0, %l3 -; SPARC-NEXT: umul %i3, %i5, %l6 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: sra %l3, 31, %o0 -; SPARC-NEXT: addcc %l4, %l7, %l4 -; SPARC-NEXT: addxcc %l5, 0, %l5 -; SPARC-NEXT: umul %i3, %i4, %l7 +; SPARC-NEXT: umul %g2, %i5, %l4 +; SPARC-NEXT: rd %y, %l5 +; SPARC-NEXT: sra %l3, 31, %l6 +; SPARC-NEXT: umul %g3, %i5, %l7 +; SPARC-NEXT: rd %y, %o0 +; SPARC-NEXT: addcc %l7, %l5, %l5 +; SPARC-NEXT: addxcc %o0, 0, %l7 +; SPARC-NEXT: umul %g2, %i4, %o0 ; SPARC-NEXT: rd %y, %o1 -; SPARC-NEXT: addcc %l7, %l4, %l4 -; SPARC-NEXT: addxcc %o1, 0, %l7 -; SPARC-NEXT: addcc %l5, %l7, %l5 -; SPARC-NEXT: addxcc %g0, 0, %l7 -; SPARC-NEXT: umul %i2, %i4, %o1 +; SPARC-NEXT: 
addcc %o0, %l5, %l5 +; SPARC-NEXT: addxcc %o1, 0, %o0 +; SPARC-NEXT: addcc %l7, %o0, %l7 +; SPARC-NEXT: addxcc %g0, 0, %o0 +; SPARC-NEXT: umul %g3, %i4, %o1 ; SPARC-NEXT: rd %y, %o2 -; SPARC-NEXT: addcc %o1, %l5, %l5 +; SPARC-NEXT: addcc %o1, %l7, %l7 ; SPARC-NEXT: sra %i4, 31, %o1 -; SPARC-NEXT: smul %o1, %i2, %i2 -; SPARC-NEXT: umul %o1, %i3, %i3 +; SPARC-NEXT: smul %o1, %g3, %g3 +; SPARC-NEXT: umul %o1, %g2, %g2 ; SPARC-NEXT: rd %y, %o3 -; SPARC-NEXT: addxcc %o2, %l7, %l7 -; SPARC-NEXT: add %o3, %i2, %i2 -; SPARC-NEXT: add %i2, %i3, %i2 -; SPARC-NEXT: addcc %l5, %i3, %i3 -; SPARC-NEXT: addxcc %l7, %i2, %l5 -; SPARC-NEXT: addcc %l6, %l0, %i2 -; SPARC-NEXT: addxcc %l4, %l1, %l0 -; SPARC-NEXT: addxcc %i3, 0, %i3 -; SPARC-NEXT: addxcc %l5, 0, %l1 +; SPARC-NEXT: addxcc %o2, %o0, %o0 +; SPARC-NEXT: add %o3, %g3, %g3 +; SPARC-NEXT: add %g3, %g2, %g3 +; SPARC-NEXT: addcc %l7, %g2, %l7 +; SPARC-NEXT: addxcc %o0, %g3, %o0 +; SPARC-NEXT: addcc %l4, %l0, %g2 +; SPARC-NEXT: addxcc %l5, %l1, %g3 +; SPARC-NEXT: addxcc %l7, 0, %l0 +; SPARC-NEXT: addxcc %o0, 0, %l1 ; SPARC-NEXT: sra %l1, 31, %l4 -; SPARC-NEXT: addcc %l2, %i3, %i3 +; SPARC-NEXT: addcc %l2, %l0, %l0 ; SPARC-NEXT: addxcc %l3, %l1, %l1 -; SPARC-NEXT: addxcc %o0, %l4, %l2 +; SPARC-NEXT: addxcc %l6, %l4, %l2 ; SPARC-NEXT: smul %i4, %g4, %l3 ; SPARC-NEXT: umul %i5, %g4, %g4 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %o0, %l4, %l4 +; SPARC-NEXT: addxcc %l6, %l4, %l4 ; SPARC-NEXT: add %l5, %g4, %l5 ; SPARC-NEXT: smul %o1, %i0, %l6 ; SPARC-NEXT: umul %o1, %i1, %l7 @@ -126,1050 +113,150 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-NEXT: addxcc %l7, 0, %i5 ; SPARC-NEXT: addcc %l5, %i5, %i5 ; SPARC-NEXT: addxcc %g0, 0, %l5 -; SPARC-NEXT: umul %i0, %i4, %i4 -; SPARC-NEXT: mov %l0, %i0 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: addcc %i4, %i5, %i4 -; SPARC-NEXT: addxcc %l0, %l5, %i5 -; SPARC-NEXT: addcc %i4, %g4, %i4 -; SPARC-NEXT: addxcc %i5, %l3, %i5 -; SPARC-NEXT: addcc %l6, %i3, %i3 +; SPARC-NEXT: umul %i0, %i4, %i0 +; SPARC-NEXT: rd %y, %i4 +; SPARC-NEXT: addcc %i0, %i5, %i0 +; SPARC-NEXT: addxcc %i4, %l5, %i4 +; SPARC-NEXT: addcc %i0, %g4, %i0 +; SPARC-NEXT: addxcc %i4, %l3, %i4 +; SPARC-NEXT: addcc %l6, %l0, %i5 ; SPARC-NEXT: addxcc %i1, %l1, %i1 -; SPARC-NEXT: addxcc %i4, %l2, %i4 -; SPARC-NEXT: addxcc %i5, %l4, %i5 -; SPARC-NEXT: sra %i0, 31, %g4 -; SPARC-NEXT: xor %i5, %g4, %i5 -; SPARC-NEXT: xor %i1, %g4, %i1 -; SPARC-NEXT: or %i1, %i5, %i1 +; SPARC-NEXT: addxcc %i0, %l2, %i0 +; SPARC-NEXT: addxcc %i4, %l4, %i4 +; SPARC-NEXT: sra %g3, 31, %g4 ; SPARC-NEXT: xor %i4, %g4, %i4 -; SPARC-NEXT: xor %i3, %g4, %i3 -; SPARC-NEXT: or %i3, %i4, %i3 -; SPARC-NEXT: or %i3, %i1, %i1 -; SPARC-NEXT: cmp %i1, 0 -; SPARC-NEXT: bne .LBB0_110 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.3: ! %overflow -; SPARC-NEXT: ba .LBB0_111 -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: .LBB0_4: ! %overflow.no.lhs -; SPARC-NEXT: cmp %g2, 0 -; SPARC-NEXT: be .LBB0_25 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.5: ! %overflow.no.lhs.only -; SPARC-NEXT: mov 1, %g4 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_7 -; SPARC-NEXT: mov %g4, %g2 -; SPARC-NEXT: ! %bb.6: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %g2 -; SPARC-NEXT: .LBB0_7: ! %overflow.no.lhs.only -; SPARC-NEXT: subcc %g0, %i3, %l4 -; SPARC-NEXT: subxcc %g0, %i2, %l3 -; SPARC-NEXT: subxcc %g0, %i1, %l1 -; SPARC-NEXT: subxcc %g0, %i0, %l2 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_26 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.8: ! 
%overflow.no.lhs.only -; SPARC-NEXT: mov %i3, %l4 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_27 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_9: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_28 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_10: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i0, %l2 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_29 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_11: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_30 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_12: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i1, %l1 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_31 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_13: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_32 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_14: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i2, %l3 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_33 -; SPARC-NEXT: nop -; SPARC-NEXT: ba .LBB0_34 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_15: ! %overflow.no.rhs.only -; SPARC-NEXT: mov 1, %g4 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_17 -; SPARC-NEXT: mov %g4, %g2 -; SPARC-NEXT: ! %bb.16: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %g2 -; SPARC-NEXT: .LBB0_17: ! %overflow.no.rhs.only -; SPARC-NEXT: subcc %g0, %g3, %l4 -; SPARC-NEXT: subxcc %g0, %l0, %l3 -; SPARC-NEXT: subxcc %g0, %i5, %l1 -; SPARC-NEXT: subxcc %g0, %i4, %l2 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_44 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.18: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g3, %l4 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_45 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_19: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_46 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_20: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i4, %l2 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_47 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_21: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_48 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_22: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i5, %l1 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_49 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_23: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_50 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_24: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %l0, %l3 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_51 -; SPARC-NEXT: nop -; SPARC-NEXT: ba .LBB0_52 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_25: ! 
%overflow.no -; SPARC-NEXT: smul %g3, %i0, %g2 -; SPARC-NEXT: umul %g3, %i1, %i0 -; SPARC-NEXT: rd %y, %l1 -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: add %l1, %g2, %g2 -; SPARC-NEXT: smul %l0, %i1, %i1 -; SPARC-NEXT: smul %i5, %i2, %l1 -; SPARC-NEXT: umul %i5, %i3, %i5 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: add %g2, %i1, %i1 -; SPARC-NEXT: add %l2, %l1, %g2 -; SPARC-NEXT: smul %i4, %i3, %i4 -; SPARC-NEXT: add %g2, %i4, %i4 -; SPARC-NEXT: addcc %i5, %i0, %i0 -; SPARC-NEXT: umul %i3, %g3, %g2 -; SPARC-NEXT: rd %y, %i5 -; SPARC-NEXT: addxcc %i4, %i1, %i4 -; SPARC-NEXT: umul %i2, %g3, %i1 -; SPARC-NEXT: rd %y, %g3 -; SPARC-NEXT: addcc %i1, %i5, %i1 -; SPARC-NEXT: addxcc %g3, 0, %i5 -; SPARC-NEXT: umul %i3, %l0, %i3 -; SPARC-NEXT: rd %y, %l1 -; SPARC-NEXT: addcc %i3, %i1, %g3 -; SPARC-NEXT: addxcc %l1, 0, %i1 -; SPARC-NEXT: addcc %i5, %i1, %i1 -; SPARC-NEXT: addxcc %g0, 0, %i3 -; SPARC-NEXT: umul %i2, %l0, %i2 -; SPARC-NEXT: rd %y, %i5 -; SPARC-NEXT: addcc %i2, %i1, %i1 -; SPARC-NEXT: addxcc %i5, %i3, %i2 -; SPARC-NEXT: addcc %i1, %i0, %i1 -; SPARC-NEXT: ba .LBB0_112 -; SPARC-NEXT: addxcc %i2, %i4, %i0 -; SPARC-NEXT: .LBB0_26: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_9 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_27: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i2, %l3 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_10 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_28: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_11 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_29: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i1, %l1 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_12 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_30: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_13 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_31: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i0, %l2 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_14 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_32: ! %overflow.no.lhs.only +; SPARC-NEXT: xor %i1, %g4, %i1 +; SPARC-NEXT: or %i1, %i4, %i1 +; SPARC-NEXT: xor %i0, %g4, %i0 +; SPARC-NEXT: xor %i5, %g4, %i4 +; SPARC-NEXT: or %i4, %i0, %i0 +; SPARC-NEXT: or %i0, %i1, %i0 ; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_34 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_33: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i3, %l4 -; SPARC-NEXT: .LBB0_34: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_36 -; SPARC-NEXT: mov %g4, %i0 -; SPARC-NEXT: ! %bb.35: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: .LBB0_36: ! %overflow.no.lhs.only -; SPARC-NEXT: subcc %g0, %g3, %l6 -; SPARC-NEXT: subxcc %g0, %l0, %l5 -; SPARC-NEXT: subxcc %g0, %i5, %i2 -; SPARC-NEXT: subxcc %g0, %i4, %i1 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_62 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.37: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g3, %l6 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_63 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_38: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_64 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_39: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i5, %i2 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_65 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_40: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_66 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_41: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i4, %i1 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_67 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_42: ! 
%overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_68 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_43: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %l0, %l5 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_69 -; SPARC-NEXT: nop -; SPARC-NEXT: ba .LBB0_70 +; SPARC-NEXT: bne .LBB0_2 ; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_44: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_19 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_45: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %l0, %l3 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_20 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_46: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_21 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_47: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i5, %l1 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_22 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_48: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_23 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_49: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i4, %l2 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_24 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_50: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_52 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_51: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g3, %l4 -; SPARC-NEXT: .LBB0_52: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_54 -; SPARC-NEXT: mov %g4, %i4 -; SPARC-NEXT: ! %bb.53: ! %overflow.no.rhs.only +; SPARC-NEXT: ! %bb.1: ! %start +; SPARC-NEXT: ba .LBB0_3 ; SPARC-NEXT: mov %g0, %i4 -; SPARC-NEXT: .LBB0_54: ! %overflow.no.rhs.only -; SPARC-NEXT: subcc %g0, %i3, %l5 -; SPARC-NEXT: subxcc %g0, %i2, %l0 -; SPARC-NEXT: subxcc %g0, %i1, %g3 -; SPARC-NEXT: subxcc %g0, %i0, %i5 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_85 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.55: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i3, %l5 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_86 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_56: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_87 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_57: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i1, %g3 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_88 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_58: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_89 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_59: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i0, %i5 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_90 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_60: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_91 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_61: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i2, %l0 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_92 -; SPARC-NEXT: nop -; SPARC-NEXT: ba .LBB0_93 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_62: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_38 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_63: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %l0, %l5 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_39 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_64: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_40 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_65: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %i4, %i1 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_41 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_66: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_42 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_67: ! 
%overflow.no.lhs.only -; SPARC-NEXT: mov %i5, %i2 -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bge .LBB0_43 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_68: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bl .LBB0_70 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_69: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g3, %l6 -; SPARC-NEXT: .LBB0_70: ! %overflow.no.lhs.only -; SPARC-NEXT: umul %l4, %l6, %i3 -; SPARC-NEXT: rd %y, %i4 -; SPARC-NEXT: umul %l3, %l6, %i5 -; SPARC-NEXT: rd %y, %g3 -; SPARC-NEXT: addcc %i5, %i4, %i4 -; SPARC-NEXT: addxcc %g3, 0, %i5 -; SPARC-NEXT: umul %l4, %l5, %g3 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: addcc %g3, %i4, %i4 -; SPARC-NEXT: addxcc %l0, 0, %g3 -; SPARC-NEXT: addcc %i5, %g3, %i5 -; SPARC-NEXT: addxcc %g0, 0, %g3 -; SPARC-NEXT: umul %l3, %l5, %l0 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l0, %i5, %i5 -; SPARC-NEXT: smul %l6, %l2, %l0 -; SPARC-NEXT: umul %l6, %l1, %l6 -; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: addxcc %l7, %g3, %l7 -; SPARC-NEXT: add %o0, %l0, %g3 -; SPARC-NEXT: smul %l5, %l1, %l0 -; SPARC-NEXT: add %g3, %l0, %l0 -; SPARC-NEXT: addcc %i5, %l6, %g3 -; SPARC-NEXT: umul %l4, %i2, %l5 -; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: addxcc %l7, %l0, %i5 -; SPARC-NEXT: umul %l3, %i2, %l0 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l0, %l6, %l0 -; SPARC-NEXT: addxcc %l7, 0, %l6 -; SPARC-NEXT: umul %l4, %i1, %l4 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l4, %l0, %l4 -; SPARC-NEXT: addxcc %l7, 0, %l0 -; SPARC-NEXT: addcc %l6, %l0, %l0 -; SPARC-NEXT: addxcc %g0, 0, %l6 -; SPARC-NEXT: umul %l3, %i1, %l3 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l3, %l0, %l0 -; SPARC-NEXT: smul %i2, %l2, %l2 -; SPARC-NEXT: umul %i2, %l1, %i2 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addxcc %l7, %l6, %l6 -; SPARC-NEXT: add %l3, %l2, %l2 -; SPARC-NEXT: smul %i1, %l1, %i1 -; SPARC-NEXT: add %l2, %i1, %i1 -; SPARC-NEXT: addcc %l0, %i2, %l0 -; SPARC-NEXT: addxcc %l6, %i1, %l1 -; SPARC-NEXT: addcc %g3, %l5, %i1 -; SPARC-NEXT: addxcc %i5, %l4, %i2 -; SPARC-NEXT: cmp %i2, %i5 -; SPARC-NEXT: bcs .LBB0_72 -; SPARC-NEXT: mov %g4, %l2 -; SPARC-NEXT: ! %bb.71: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %l2 -; SPARC-NEXT: .LBB0_72: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i1, %g3 -; SPARC-NEXT: bcs .LBB0_74 -; SPARC-NEXT: mov %g4, %g3 -; SPARC-NEXT: ! %bb.73: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %g3 -; SPARC-NEXT: .LBB0_74: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i2, %i5 -; SPARC-NEXT: be .LBB0_76 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.75: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %l2, %g3 -; SPARC-NEXT: .LBB0_76: ! %overflow.no.lhs.only -; SPARC-NEXT: addcc %l0, %g3, %i5 -; SPARC-NEXT: addxcc %l1, 0, %l0 -; SPARC-NEXT: xor %i0, %g2, %i0 -; SPARC-NEXT: sub %g0, %i0, %l1 -; SPARC-NEXT: xor %i4, %l1, %i4 -; SPARC-NEXT: xor %i3, %l1, %i3 -; SPARC-NEXT: addcc %i3, %i0, %g2 -; SPARC-NEXT: addxcc %i4, 0, %g3 -; SPARC-NEXT: cmp %g2, %i0 -; SPARC-NEXT: bcs .LBB0_78 -; SPARC-NEXT: mov %g4, %i3 -; SPARC-NEXT: ! %bb.77: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %i3 -; SPARC-NEXT: .LBB0_78: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %g3, 0 -; SPARC-NEXT: be .LBB0_80 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.79: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %i3 -; SPARC-NEXT: .LBB0_80: ! %overflow.no.lhs.only -; SPARC-NEXT: xor %i1, %l1, %i0 -; SPARC-NEXT: xor %i2, %l1, %i2 -; SPARC-NEXT: addcc %i0, %i3, %i1 -; SPARC-NEXT: addxcc %i2, 0, %i0 -; SPARC-NEXT: cmp %i1, %i3 -; SPARC-NEXT: bcs .LBB0_82 -; SPARC-NEXT: mov %g4, %i2 -; SPARC-NEXT: ! %bb.81: ! 
%overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %i2 -; SPARC-NEXT: .LBB0_82: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: be .LBB0_84 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.83: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %i2 -; SPARC-NEXT: .LBB0_84: ! %overflow.no.lhs.only -; SPARC-NEXT: xor %i5, %l1, %i3 -; SPARC-NEXT: xor %l0, %l1, %i4 -; SPARC-NEXT: addcc %i3, %i2, %i2 -; SPARC-NEXT: ba .LBB0_108 -; SPARC-NEXT: addxcc %i4, 0, %i3 -; SPARC-NEXT: .LBB0_85: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_56 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_86: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i2, %l0 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_57 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_87: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_58 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_88: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i0, %i5 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_59 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_89: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_60 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_90: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i1, %g3 -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bge .LBB0_61 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_91: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: bl .LBB0_93 -; SPARC-NEXT: nop -; SPARC-NEXT: .LBB0_92: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %i3, %l5 -; SPARC-NEXT: .LBB0_93: ! %overflow.no.rhs.only -; SPARC-NEXT: umul %l4, %l5, %i0 -; SPARC-NEXT: rd %y, %i1 -; SPARC-NEXT: umul %l3, %l5, %i2 -; SPARC-NEXT: rd %y, %i3 -; SPARC-NEXT: addcc %i2, %i1, %i1 -; SPARC-NEXT: addxcc %i3, 0, %i2 -; SPARC-NEXT: umul %l4, %l0, %i3 -; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: addcc %i3, %i1, %i1 -; SPARC-NEXT: addxcc %l6, 0, %i3 -; SPARC-NEXT: addcc %i2, %i3, %i2 -; SPARC-NEXT: addxcc %g0, 0, %i3 -; SPARC-NEXT: umul %l3, %l0, %l6 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l6, %i2, %i2 -; SPARC-NEXT: smul %l5, %l2, %l6 -; SPARC-NEXT: umul %l5, %l1, %l5 -; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: addxcc %l7, %i3, %l7 -; SPARC-NEXT: add %o0, %l6, %i3 -; SPARC-NEXT: smul %l0, %l1, %l0 -; SPARC-NEXT: add %i3, %l0, %l0 -; SPARC-NEXT: addcc %i2, %l5, %i3 -; SPARC-NEXT: umul %l4, %g3, %l5 -; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: addxcc %l7, %l0, %i2 -; SPARC-NEXT: umul %l3, %g3, %l0 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l0, %l6, %l0 -; SPARC-NEXT: addxcc %l7, 0, %l6 -; SPARC-NEXT: umul %l4, %i5, %l4 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l4, %l0, %l0 -; SPARC-NEXT: addxcc %l7, 0, %l4 -; SPARC-NEXT: addcc %l6, %l4, %l4 -; SPARC-NEXT: addxcc %g0, 0, %l6 -; SPARC-NEXT: umul %l3, %i5, %l3 -; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: addcc %l3, %l4, %l3 -; SPARC-NEXT: smul %g3, %l2, %l2 -; SPARC-NEXT: umul %g3, %l1, %g3 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addxcc %l7, %l6, %l6 -; SPARC-NEXT: add %l4, %l2, %l2 -; SPARC-NEXT: smul %i5, %l1, %i5 -; SPARC-NEXT: add %l2, %i5, %i5 -; SPARC-NEXT: addcc %l3, %g3, %g3 -; SPARC-NEXT: addxcc %l6, %i5, %l1 -; SPARC-NEXT: addcc %i3, %l5, %i5 -; SPARC-NEXT: addxcc %i2, %l0, %l0 -; SPARC-NEXT: cmp %l0, %i2 -; SPARC-NEXT: bcs .LBB0_95 -; SPARC-NEXT: mov %g4, %l2 -; SPARC-NEXT: ! %bb.94: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %l2 -; SPARC-NEXT: .LBB0_95: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i5, %i3 -; SPARC-NEXT: bcs .LBB0_97 -; SPARC-NEXT: mov %g4, %i3 -; SPARC-NEXT: ! %bb.96: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %i3 -; SPARC-NEXT: .LBB0_97: ! 
%overflow.no.rhs.only -; SPARC-NEXT: cmp %l0, %i2 -; SPARC-NEXT: be .LBB0_99 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.98: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %l2, %i3 -; SPARC-NEXT: .LBB0_99: ! %overflow.no.rhs.only -; SPARC-NEXT: addcc %g3, %i3, %i2 -; SPARC-NEXT: addxcc %l1, 0, %i3 -; SPARC-NEXT: xor %g2, %i4, %l1 -; SPARC-NEXT: sub %g0, %l1, %i4 -; SPARC-NEXT: xor %i1, %i4, %i1 -; SPARC-NEXT: xor %i0, %i4, %i0 -; SPARC-NEXT: addcc %i0, %l1, %g2 -; SPARC-NEXT: addxcc %i1, 0, %g3 -; SPARC-NEXT: cmp %g2, %l1 -; SPARC-NEXT: bcs .LBB0_101 -; SPARC-NEXT: mov %g4, %l1 -; SPARC-NEXT: ! %bb.100: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %l1 -; SPARC-NEXT: .LBB0_101: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %g3, 0 -; SPARC-NEXT: be .LBB0_103 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.102: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %l1 -; SPARC-NEXT: .LBB0_103: ! %overflow.no.rhs.only -; SPARC-NEXT: xor %i5, %i4, %i0 -; SPARC-NEXT: xor %l0, %i4, %i5 -; SPARC-NEXT: addcc %i0, %l1, %i1 -; SPARC-NEXT: addxcc %i5, 0, %i0 -; SPARC-NEXT: cmp %i1, %l1 -; SPARC-NEXT: bcs .LBB0_105 -; SPARC-NEXT: mov %g4, %i5 -; SPARC-NEXT: ! %bb.104: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %i5 -; SPARC-NEXT: .LBB0_105: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i0, 0 -; SPARC-NEXT: be .LBB0_107 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.106: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %i5 -; SPARC-NEXT: .LBB0_107: ! %overflow.no.rhs.only -; SPARC-NEXT: xor %i2, %i4, %i2 -; SPARC-NEXT: xor %i3, %i4, %i3 -; SPARC-NEXT: addcc %i2, %i5, %i2 -; SPARC-NEXT: addxcc %i3, 0, %i3 -; SPARC-NEXT: .LBB0_108: ! %overflow.no.rhs.only -; SPARC-NEXT: or %i2, %i3, %i2 -; SPARC-NEXT: cmp %i2, 0 -; SPARC-NEXT: bne .LBB0_112 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.109: ! %overflow.no.rhs.only -; SPARC-NEXT: ba .LBB0_112 -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: .LBB0_110: -; SPARC-NEXT: mov 1, %g4 -; SPARC-NEXT: .LBB0_111: ! %overflow -; SPARC-NEXT: mov %i2, %i1 -; SPARC-NEXT: .LBB0_112: ! %overflow.res -; SPARC-NEXT: and %g4, 1, %i4 -; SPARC-NEXT: mov %g3, %i2 +; SPARC-NEXT: .LBB0_2: +; SPARC-NEXT: mov 1, %i4 +; SPARC-NEXT: .LBB0_3: ! %start +; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret -; SPARC-NEXT: restore %g0, %g2, %o3 +; SPARC-NEXT: restore %g0, %g2, %o1 ; ; SPARC64-LABEL: muloti_test: ; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch -; SPARC64-NEXT: ! %bb.0: ! %overflow.entry +; SPARC64-NEXT: ! %bb.0: ! %start ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: mov %i1, %i4 -; SPARC64-NEXT: srax %i1, 63, %i1 -; SPARC64-NEXT: cmp %i0, %i1 -; SPARC64-NEXT: be %xcc, .LBB0_3 -; SPARC64-NEXT: srax %i3, 63, %i1 -; SPARC64-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC64-NEXT: cmp %i2, %i1 -; SPARC64-NEXT: be %xcc, .LBB0_5 -; SPARC64-NEXT: nop -; SPARC64-NEXT: ! %bb.2: ! 
%overflow -; SPARC64-NEXT: srax %i0, 63, %i5 -; SPARC64-NEXT: mov %i5, %o0 +; SPARC64-NEXT: mov %i3, %i4 +; SPARC64-NEXT: mov %i1, %i5 +; SPARC64-NEXT: mov %i0, %l2 +; SPARC64-NEXT: srax %i0, 63, %i3 +; SPARC64-NEXT: mov %i3, %o0 ; SPARC64-NEXT: mov %i0, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %i4, %o3 ; SPARC64-NEXT: mov %o0, %l0 ; SPARC64-NEXT: mov %o1, %l1 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %i1, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 +; SPARC64-NEXT: mov %i4, %o3 ; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %g0, %i3 -; SPARC64-NEXT: add %l1, %o0, %l2 -; SPARC64-NEXT: cmp %l2, %l1 -; SPARC64-NEXT: movcs %xcc, 1, %i3 -; SPARC64-NEXT: srl %i3, 0, %i3 -; SPARC64-NEXT: add %l0, %i3, %l0 +; SPARC64-NEXT: mov %g0, %i0 +; SPARC64-NEXT: add %l1, %o0, %l3 +; SPARC64-NEXT: cmp %l3, %l1 +; SPARC64-NEXT: movcs %xcc, 1, %i0 +; SPARC64-NEXT: srl %i0, 0, %i0 +; SPARC64-NEXT: add %l0, %i0, %l0 ; SPARC64-NEXT: srax %l0, 63, %l1 -; SPARC64-NEXT: srax %i2, 63, %i3 +; SPARC64-NEXT: srax %i2, 63, %i4 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %i3, %o2 +; SPARC64-NEXT: mov %i5, %o1 +; SPARC64-NEXT: mov %i4, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: mov %g0, %i4 +; SPARC64-NEXT: mov %g0, %i5 ; SPARC64-NEXT: mov %g0, %g2 -; SPARC64-NEXT: add %o1, %l2, %g3 -; SPARC64-NEXT: cmp %g3, %o1 -; SPARC64-NEXT: movcs %xcc, 1, %i4 -; SPARC64-NEXT: srl %i4, 0, %i4 -; SPARC64-NEXT: add %o0, %i4, %i4 -; SPARC64-NEXT: srax %i4, 63, %g4 -; SPARC64-NEXT: add %l1, %g4, %g4 -; SPARC64-NEXT: add %l0, %i4, %i4 -; SPARC64-NEXT: cmp %i4, %l0 +; SPARC64-NEXT: add %o1, %l3, %i0 +; SPARC64-NEXT: cmp %i0, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %i5 +; SPARC64-NEXT: srl %i5, 0, %i5 +; SPARC64-NEXT: add %o0, %i5, %i5 +; SPARC64-NEXT: srax %i5, 63, %g3 +; SPARC64-NEXT: add %l1, %g3, %g3 +; SPARC64-NEXT: add %l0, %i5, %i5 +; SPARC64-NEXT: cmp %i5, %l0 ; SPARC64-NEXT: movcs %xcc, 1, %g2 ; SPARC64-NEXT: srl %g2, 0, %g2 -; SPARC64-NEXT: add %g4, %g2, %l0 -; SPARC64-NEXT: mov %i5, %o0 -; SPARC64-NEXT: mov %i0, %o1 -; SPARC64-NEXT: mov %g3, %i0 -; SPARC64-NEXT: mov %i3, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: mov %g0, %i3 -; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: add %o0, %l0, %i5 -; SPARC64-NEXT: add %o1, %i4, %i4 -; SPARC64-NEXT: cmp %i4, %o1 -; SPARC64-NEXT: movcs %xcc, 1, %i3 -; SPARC64-NEXT: srl %i3, 0, %i3 -; SPARC64-NEXT: add %i5, %i3, %i3 -; SPARC64-NEXT: srax %i0, 63, %i5 -; SPARC64-NEXT: xor %i3, %i5, %i3 -; SPARC64-NEXT: xor %i4, %i5, %i4 -; SPARC64-NEXT: ba .LBB0_7 -; SPARC64-NEXT: or %i4, %i3, %i3 -; SPARC64-NEXT: .LBB0_3: ! %overflow.no.lhs -; SPARC64-NEXT: cmp %i2, %i1 -; SPARC64-NEXT: be %xcc, .LBB0_8 -; SPARC64-NEXT: nop -; SPARC64-NEXT: ! %bb.4: ! 
%overflow.no.lhs.only -; SPARC64-NEXT: mov %g0, %i5 -; SPARC64-NEXT: mov %g0, %i1 -; SPARC64-NEXT: mov %g0, %l0 -; SPARC64-NEXT: mov %g0, %g2 -; SPARC64-NEXT: movrnz %i4, 1, %i1 -; SPARC64-NEXT: srl %i1, 0, %i1 -; SPARC64-NEXT: add %i0, %i1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i1 -; SPARC64-NEXT: mov %i0, %g3 -; SPARC64-NEXT: movrlz %i0, %i1, %g3 -; SPARC64-NEXT: sub %g0, %i4, %i1 -; SPARC64-NEXT: mov %i4, %g4 -; SPARC64-NEXT: movrlz %i0, %i1, %g4 -; SPARC64-NEXT: movrlz %i0, 1, %i5 -; SPARC64-NEXT: movrlz %i0, %g4, %i4 -; SPARC64-NEXT: movrlz %i0, %g3, %i0 -; SPARC64-NEXT: movrlz %i2, 1, %l0 -; SPARC64-NEXT: sub %g0, %i3, %i1 -; SPARC64-NEXT: mov %i3, %g3 -; SPARC64-NEXT: movrlz %i2, %i1, %g3 -; SPARC64-NEXT: movrnz %i3, 1, %g2 -; SPARC64-NEXT: srl %g2, 0, %i1 -; SPARC64-NEXT: add %i2, %i1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i1 -; SPARC64-NEXT: mov %i2, %g2 -; SPARC64-NEXT: movrlz %i2, %i1, %g2 -; SPARC64-NEXT: movrlz %i2, %g3, %i3 -; SPARC64-NEXT: movrlz %i2, %g2, %i2 -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %o0, %i1 -; SPARC64-NEXT: mov %o1, %i3 -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %g0, %o2 +; SPARC64-NEXT: add %g3, %g2, %l0 +; SPARC64-NEXT: mov %i3, %o0 +; SPARC64-NEXT: mov %l2, %o1 +; SPARC64-NEXT: mov %i4, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: mov %g0, %i0 -; SPARC64-NEXT: mov %g0, %i4 -; SPARC64-NEXT: mov %g0, %g2 ; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: add %i1, %o1, %g3 -; SPARC64-NEXT: cmp %g3, %i1 -; SPARC64-NEXT: movcs %xcc, 1, %i0 -; SPARC64-NEXT: srl %i0, 0, %i0 -; SPARC64-NEXT: add %o0, %i0, %g4 -; SPARC64-NEXT: xor %l0, %i5, %i0 -; SPARC64-NEXT: and %i0, 1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i5 -; SPARC64-NEXT: srl %i0, 0, %i0 -; SPARC64-NEXT: xor %i3, %i5, %i1 -; SPARC64-NEXT: add %i1, %i0, %i1 -; SPARC64-NEXT: cmp %i1, %i0 -; SPARC64-NEXT: movcs %xcc, 1, %i4 -; SPARC64-NEXT: ba .LBB0_6 -; SPARC64-NEXT: srl %i4, 0, %i3 -; SPARC64-NEXT: .LBB0_5: ! 
%overflow.no.rhs.only -; SPARC64-NEXT: mov %g0, %i5 -; SPARC64-NEXT: mov %g0, %i1 -; SPARC64-NEXT: mov %g0, %l0 -; SPARC64-NEXT: mov %g0, %g2 -; SPARC64-NEXT: movrnz %i3, 1, %i1 -; SPARC64-NEXT: srl %i1, 0, %i1 -; SPARC64-NEXT: add %i2, %i1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i1 -; SPARC64-NEXT: mov %i2, %g3 -; SPARC64-NEXT: movrlz %i2, %i1, %g3 -; SPARC64-NEXT: sub %g0, %i3, %i1 -; SPARC64-NEXT: mov %i3, %g4 -; SPARC64-NEXT: movrlz %i2, %i1, %g4 -; SPARC64-NEXT: movrlz %i2, 1, %i5 -; SPARC64-NEXT: movrlz %i2, %g4, %i3 -; SPARC64-NEXT: movrlz %i2, %g3, %i2 -; SPARC64-NEXT: movrlz %i0, 1, %l0 -; SPARC64-NEXT: sub %g0, %i4, %i1 -; SPARC64-NEXT: mov %i4, %g3 -; SPARC64-NEXT: movrlz %i0, %i1, %g3 -; SPARC64-NEXT: movrnz %i4, 1, %g2 -; SPARC64-NEXT: srl %g2, 0, %i1 -; SPARC64-NEXT: add %i0, %i1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i1 -; SPARC64-NEXT: mov %i0, %g2 -; SPARC64-NEXT: movrlz %i0, %i1, %g2 -; SPARC64-NEXT: movrlz %i0, %g3, %i4 -; SPARC64-NEXT: movrlz %i0, %g2, %i0 -; SPARC64-NEXT: mov %i2, %o0 -; SPARC64-NEXT: mov %i3, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 -; SPARC64-NEXT: mov %o0, %i1 -; SPARC64-NEXT: mov %o1, %i4 -; SPARC64-NEXT: mov %i2, %o0 -; SPARC64-NEXT: mov %i3, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i0, %o3 -; SPARC64-NEXT: mov %g0, %i0 ; SPARC64-NEXT: mov %g0, %i3 -; SPARC64-NEXT: mov %g0, %g2 -; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: add %i1, %o1, %g3 -; SPARC64-NEXT: cmp %g3, %i1 -; SPARC64-NEXT: movcs %xcc, 1, %i0 -; SPARC64-NEXT: srl %i0, 0, %i0 -; SPARC64-NEXT: add %o0, %i0, %g4 -; SPARC64-NEXT: xor %i5, %l0, %i0 -; SPARC64-NEXT: and %i0, 1, %i1 -; SPARC64-NEXT: sub %g0, %i1, %i5 -; SPARC64-NEXT: srl %i0, 0, %i0 -; SPARC64-NEXT: xor %i4, %i5, %i1 -; SPARC64-NEXT: add %i1, %i0, %i1 -; SPARC64-NEXT: cmp %i1, %i0 -; SPARC64-NEXT: movcs %xcc, 1, %i3 -; SPARC64-NEXT: srl %i3, 0, %i3 -; SPARC64-NEXT: .LBB0_6: ! %overflow.res -; SPARC64-NEXT: xor %g3, %i5, %i0 -; SPARC64-NEXT: add %i0, %i3, %i0 -; SPARC64-NEXT: cmp %i0, %i3 -; SPARC64-NEXT: movcs %xcc, 1, %g2 -; SPARC64-NEXT: srl %g2, 0, %i3 -; SPARC64-NEXT: xor %g4, %i5, %i4 -; SPARC64-NEXT: add %i4, %i3, %i3 -; SPARC64-NEXT: .LBB0_7: ! %overflow.res -; SPARC64-NEXT: ba .LBB0_9 -; SPARC64-NEXT: movrnz %i3, 1, %i2 -; SPARC64-NEXT: .LBB0_8: ! %overflow.no -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %i2, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %o0, %i0 -; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: .LBB0_9: ! %overflow.res -; SPARC64-NEXT: and %i2, 1, %i2 +; SPARC64-NEXT: add %o0, %l0, %i4 +; SPARC64-NEXT: add %o1, %i5, %i5 +; SPARC64-NEXT: cmp %i5, %o1 +; SPARC64-NEXT: movcs %xcc, 1, %i2 +; SPARC64-NEXT: srl %i2, 0, %i2 +; SPARC64-NEXT: add %i4, %i2, %i2 +; SPARC64-NEXT: srax %i0, 63, %i4 +; SPARC64-NEXT: xor %i2, %i4, %i2 +; SPARC64-NEXT: xor %i5, %i4, %i4 +; SPARC64-NEXT: or %i4, %i2, %i2 +; SPARC64-NEXT: movrnz %i2, 1, %i3 +; SPARC64-NEXT: srl %i3, 0, %i2 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore ; ; SPARC64-VIS3-LABEL: muloti_test: ; SPARC64-VIS3: .register %g2, #scratch ; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! %overflow.entry +; SPARC64-VIS3-NEXT: ! %bb.0: ! 
%start ; SPARC64-VIS3-NEXT: save %sp, -128, %sp -; SPARC64-VIS3-NEXT: srax %i1, 63, %i4 -; SPARC64-VIS3-NEXT: cmp %i0, %i4 -; SPARC64-VIS3-NEXT: be %xcc, .LBB0_3 -; SPARC64-VIS3-NEXT: srax %i3, 63, %i4 -; SPARC64-VIS3-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC64-VIS3-NEXT: cmp %i2, %i4 -; SPARC64-VIS3-NEXT: be %xcc, .LBB0_5 -; SPARC64-VIS3-NEXT: nop -; SPARC64-VIS3-NEXT: ! %bb.2: ! %overflow -; SPARC64-VIS3-NEXT: mov %g0, %i4 -; SPARC64-VIS3-NEXT: srax %i0, 63, %i5 -; SPARC64-VIS3-NEXT: mulx %i5, %i3, %g2 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %g3 -; SPARC64-VIS3-NEXT: add %g3, %g2, %g2 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 -; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 -; SPARC64-VIS3-NEXT: addxccc %g2, %g0, %g2 -; SPARC64-VIS3-NEXT: srax %i2, 63, %g4 -; SPARC64-VIS3-NEXT: mulx %i1, %g4, %g5 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %l0 -; SPARC64-VIS3-NEXT: add %l0, %g5, %g5 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %l0 -; SPARC64-VIS3-NEXT: addcc %l0, %g3, %g3 -; SPARC64-VIS3-NEXT: addxccc %g5, %g0, %g5 -; SPARC64-VIS3-NEXT: srax %g5, 63, %l0 -; SPARC64-VIS3-NEXT: addcc %g2, %g5, %g5 -; SPARC64-VIS3-NEXT: srax %g2, 63, %g2 -; SPARC64-VIS3-NEXT: addxccc %g2, %l0, %g2 -; SPARC64-VIS3-NEXT: and %g4, %i0, %g4 -; SPARC64-VIS3-NEXT: and %i5, %i2, %i5 -; SPARC64-VIS3-NEXT: add %i5, %g4, %i5 -; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g4 -; SPARC64-VIS3-NEXT: sub %g4, %i5, %i5 -; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i2 -; SPARC64-VIS3-NEXT: mov %g3, %i0 -; SPARC64-VIS3-NEXT: addcc %i2, %g5, %i2 -; SPARC64-VIS3-NEXT: addxccc %i5, %g2, %i5 -; SPARC64-VIS3-NEXT: srax %g3, 63, %g2 -; SPARC64-VIS3-NEXT: xor %i5, %g2, %i5 -; SPARC64-VIS3-NEXT: xor %i2, %g2, %i2 -; SPARC64-VIS3-NEXT: or %i2, %i5, %i2 -; SPARC64-VIS3-NEXT: ba .LBB0_7 -; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 -; SPARC64-VIS3-NEXT: .LBB0_3: ! %overflow.no.lhs -; SPARC64-VIS3-NEXT: cmp %i2, %i4 -; SPARC64-VIS3-NEXT: be %xcc, .LBB0_6 -; SPARC64-VIS3-NEXT: nop -; SPARC64-VIS3-NEXT: ! %bb.4: ! 
%overflow.no.lhs.only ; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mov %g0, %g3 -; SPARC64-VIS3-NEXT: mov %g0, %g2 -; SPARC64-VIS3-NEXT: mov %g0, %g4 -; SPARC64-VIS3-NEXT: mov %g0, %g5 -; SPARC64-VIS3-NEXT: mov %g0, %l0 -; SPARC64-VIS3-NEXT: mov %g0, %l1 -; SPARC64-VIS3-NEXT: mov %g0, %i4 -; SPARC64-VIS3-NEXT: sub %g0, %i1, %l2 -; SPARC64-VIS3-NEXT: mov %i1, %l3 -; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %l3 -; SPARC64-VIS3-NEXT: movrnz %i1, 1, %g3 -; SPARC64-VIS3-NEXT: srl %g3, 0, %g3 -; SPARC64-VIS3-NEXT: add %i0, %g3, %g3 -; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 -; SPARC64-VIS3-NEXT: mov %i0, %l2 -; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %l2 -; SPARC64-VIS3-NEXT: movrlz %i0, 1, %i5 -; SPARC64-VIS3-NEXT: movrlz %i0, %l3, %i1 -; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %i0 -; SPARC64-VIS3-NEXT: sub %g0, %i3, %g3 -; SPARC64-VIS3-NEXT: mov %i3, %l2 -; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %l2 -; SPARC64-VIS3-NEXT: movrnz %i3, 1, %g4 -; SPARC64-VIS3-NEXT: srl %g4, 0, %g3 -; SPARC64-VIS3-NEXT: add %i2, %g3, %g3 -; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 -; SPARC64-VIS3-NEXT: mov %i2, %g4 -; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %g4 -; SPARC64-VIS3-NEXT: movrlz %i2, 1, %g2 -; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %i3 -; SPARC64-VIS3-NEXT: movrlz %i2, %g4, %i2 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i3, %i4 +; SPARC64-VIS3-NEXT: srax %i0, 63, %g2 +; SPARC64-VIS3-NEXT: mulx %g2, %i3, %g3 +; SPARC64-VIS3-NEXT: add %i4, %g3, %i4 ; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 ; SPARC64-VIS3-NEXT: mulx %i0, %i3, %g4 -; SPARC64-VIS3-NEXT: add %g3, %g4, %g3 +; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 +; SPARC64-VIS3-NEXT: addxccc %i4, %g0, %g4 +; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 +; SPARC64-VIS3-NEXT: srax %i2, 63, %g5 +; SPARC64-VIS3-NEXT: mulx %i1, %g5, %l0 +; SPARC64-VIS3-NEXT: add %i4, %l0, %l0 +; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i4 +; SPARC64-VIS3-NEXT: addcc %i4, %g3, %i4 +; SPARC64-VIS3-NEXT: addxccc %l0, %g0, %g3 +; SPARC64-VIS3-NEXT: srax %g3, 63, %l0 +; SPARC64-VIS3-NEXT: addcc %g4, %g3, %g3 +; SPARC64-VIS3-NEXT: srax %g4, 63, %g4 +; SPARC64-VIS3-NEXT: addxccc %g4, %l0, %g4 +; SPARC64-VIS3-NEXT: and %g5, %i0, %g5 +; SPARC64-VIS3-NEXT: and %g2, %i2, %g2 +; SPARC64-VIS3-NEXT: add %g2, %g5, %g2 +; SPARC64-VIS3-NEXT: umulxhi %i0, %i2, %g5 +; SPARC64-VIS3-NEXT: sub %g5, %g2, %g2 ; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %g4 -; SPARC64-VIS3-NEXT: add %g4, %i0, %i0 -; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i3 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i1 -; SPARC64-VIS3-NEXT: add %g3, %i1, %i2 -; SPARC64-VIS3-NEXT: cmp %i2, %g3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g5 -; SPARC64-VIS3-NEXT: srl %g5, 0, %i1 -; SPARC64-VIS3-NEXT: add %i0, %i1, %g3 -; SPARC64-VIS3-NEXT: xor %g2, %i5, %i0 -; SPARC64-VIS3-NEXT: and %i0, 1, %i1 -; SPARC64-VIS3-NEXT: sub %g0, %i1, %i5 -; SPARC64-VIS3-NEXT: srl %i0, 0, %i0 -; SPARC64-VIS3-NEXT: xor %i3, %i5, %i1 -; SPARC64-VIS3-NEXT: add %i1, %i0, %i1 -; SPARC64-VIS3-NEXT: cmp %i1, %i0 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l0 -; SPARC64-VIS3-NEXT: srl %l0, 0, %i3 -; SPARC64-VIS3-NEXT: xor %i2, %i5, %i0 -; SPARC64-VIS3-NEXT: add %i0, %i3, %i0 -; SPARC64-VIS3-NEXT: cmp %i0, %i3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l1 -; SPARC64-VIS3-NEXT: srl %l1, 0, %i2 -; SPARC64-VIS3-NEXT: xor %g3, %i5, %i3 -; SPARC64-VIS3-NEXT: add %i3, %i2, %i2 -; SPARC64-VIS3-NEXT: ba .LBB0_8 -; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 -; SPARC64-VIS3-NEXT: .LBB0_5: ! 
%overflow.no.rhs.only -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mov %g0, %g3 -; SPARC64-VIS3-NEXT: mov %g0, %g2 -; SPARC64-VIS3-NEXT: mov %g0, %g4 -; SPARC64-VIS3-NEXT: mov %g0, %g5 -; SPARC64-VIS3-NEXT: mov %g0, %l0 -; SPARC64-VIS3-NEXT: mov %g0, %l1 -; SPARC64-VIS3-NEXT: mov %g0, %i4 -; SPARC64-VIS3-NEXT: sub %g0, %i3, %l2 -; SPARC64-VIS3-NEXT: mov %i3, %l3 -; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %l3 -; SPARC64-VIS3-NEXT: movrnz %i3, 1, %g3 -; SPARC64-VIS3-NEXT: srl %g3, 0, %g3 -; SPARC64-VIS3-NEXT: add %i2, %g3, %g3 -; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 -; SPARC64-VIS3-NEXT: mov %i2, %l2 -; SPARC64-VIS3-NEXT: movrlz %i2, %g3, %l2 -; SPARC64-VIS3-NEXT: movrlz %i2, 1, %i5 -; SPARC64-VIS3-NEXT: movrlz %i2, %l3, %i3 -; SPARC64-VIS3-NEXT: movrlz %i2, %l2, %i2 -; SPARC64-VIS3-NEXT: sub %g0, %i1, %g3 -; SPARC64-VIS3-NEXT: mov %i1, %l2 -; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %l2 -; SPARC64-VIS3-NEXT: movrnz %i1, 1, %g4 -; SPARC64-VIS3-NEXT: srl %g4, 0, %g3 -; SPARC64-VIS3-NEXT: add %i0, %g3, %g3 -; SPARC64-VIS3-NEXT: sub %g0, %g3, %g3 -; SPARC64-VIS3-NEXT: mov %i0, %g4 -; SPARC64-VIS3-NEXT: movrlz %i0, %g3, %g4 -; SPARC64-VIS3-NEXT: movrlz %i0, 1, %g2 -; SPARC64-VIS3-NEXT: movrlz %i0, %l2, %i1 -; SPARC64-VIS3-NEXT: movrlz %i0, %g4, %i0 -; SPARC64-VIS3-NEXT: umulxhi %i3, %i1, %g3 -; SPARC64-VIS3-NEXT: mulx %i2, %i1, %g4 -; SPARC64-VIS3-NEXT: add %g3, %g4, %g3 -; SPARC64-VIS3-NEXT: mulx %i2, %i0, %i2 -; SPARC64-VIS3-NEXT: umulxhi %i3, %i0, %g4 -; SPARC64-VIS3-NEXT: add %g4, %i2, %i2 -; SPARC64-VIS3-NEXT: mulx %i3, %i1, %i1 -; SPARC64-VIS3-NEXT: mulx %i3, %i0, %i0 -; SPARC64-VIS3-NEXT: add %g3, %i0, %i0 -; SPARC64-VIS3-NEXT: cmp %i0, %g3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g5 -; SPARC64-VIS3-NEXT: srl %g5, 0, %i3 -; SPARC64-VIS3-NEXT: add %i2, %i3, %i2 -; SPARC64-VIS3-NEXT: xor %i5, %g2, %i3 -; SPARC64-VIS3-NEXT: and %i3, 1, %i5 -; SPARC64-VIS3-NEXT: sub %g0, %i5, %i5 -; SPARC64-VIS3-NEXT: srl %i3, 0, %i3 -; SPARC64-VIS3-NEXT: xor %i1, %i5, %i1 -; SPARC64-VIS3-NEXT: add %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: cmp %i1, %i3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l0 -; SPARC64-VIS3-NEXT: srl %l0, 0, %i3 -; SPARC64-VIS3-NEXT: xor %i0, %i5, %i0 -; SPARC64-VIS3-NEXT: add %i0, %i3, %i0 -; SPARC64-VIS3-NEXT: cmp %i0, %i3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %l1 -; SPARC64-VIS3-NEXT: srl %l1, 0, %i3 -; SPARC64-VIS3-NEXT: xor %i2, %i5, %i2 -; SPARC64-VIS3-NEXT: add %i2, %i3, %i2 -; SPARC64-VIS3-NEXT: ba .LBB0_8 -; SPARC64-VIS3-NEXT: movrnz %i2, 1, %i4 -; SPARC64-VIS3-NEXT: .LBB0_6: ! %overflow.no -; SPARC64-VIS3-NEXT: mov %g0, %i4 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %i5 -; SPARC64-VIS3-NEXT: add %i5, %i2, %i2 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i0 -; SPARC64-VIS3-NEXT: add %i2, %i0, %i0 -; SPARC64-VIS3-NEXT: .LBB0_7: ! %overflow.res +; SPARC64-VIS3-NEXT: addcc %i0, %g3, %i0 +; SPARC64-VIS3-NEXT: addxccc %g2, %g4, %i2 +; SPARC64-VIS3-NEXT: srax %i4, 63, %g2 +; SPARC64-VIS3-NEXT: xor %i2, %g2, %i2 +; SPARC64-VIS3-NEXT: xor %i0, %g2, %i0 +; SPARC64-VIS3-NEXT: or %i0, %i2, %i0 +; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 ; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: .LBB0_8: ! 
%overflow.res -; SPARC64-VIS3-NEXT: and %i4, 1, %i2 +; SPARC64-VIS3-NEXT: srl %i5, 0, %i2 ; SPARC64-VIS3-NEXT: ret -; SPARC64-VIS3-NEXT: restore +; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %l, i128 %r) %1 = extractvalue { i128, i1 } %0, 0 diff --git a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll index 4533523f97d74..6d197c88bfecd 100644 --- a/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/SPARC/umulo-128-legalisation-lowering.ll @@ -5,470 +5,207 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC-LABEL: muloti_test: -; SPARC: ! %bb.0: ! %overflow.entry +; SPARC: ! %bb.0: ! %start ; SPARC-NEXT: save %sp, -96, %sp -; SPARC-NEXT: ld [%fp+96], %l1 -; SPARC-NEXT: ld [%fp+92], %g4 -; SPARC-NEXT: or %i1, %i0, %l0 -; SPARC-NEXT: cmp %l0, 0 ; SPARC-NEXT: mov %i3, %g2 -; SPARC-NEXT: be .LBB0_33 -; SPARC-NEXT: mov %i2, %g3 -; SPARC-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC-NEXT: or %i5, %i4, %l2 -; SPARC-NEXT: cmp %l2, 0 -; SPARC-NEXT: be .LBB0_40 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.2: ! %overflow -; SPARC-NEXT: umul %g3, %i5, %i2 +; SPARC-NEXT: mov %i2, %g4 +; SPARC-NEXT: umul %i2, %i5, %i2 ; SPARC-NEXT: rd %y, %l7 -; SPARC-NEXT: umul %i4, %g2, %i3 -; SPARC-NEXT: rd %y, %o2 -; SPARC-NEXT: umul %i5, %g2, %l5 +; SPARC-NEXT: ld [%fp+92], %l4 +; SPARC-NEXT: umul %i4, %i3, %i3 +; SPARC-NEXT: rd %y, %o1 +; SPARC-NEXT: ld [%fp+96], %g3 +; SPARC-NEXT: umul %i5, %g2, %l3 ; SPARC-NEXT: rd %y, %o0 -; SPARC-NEXT: umul %g4, %i1, %l4 -; SPARC-NEXT: rd %y, %l3 +; SPARC-NEXT: umul %l4, %i1, %l2 +; SPARC-NEXT: rd %y, %l1 ; SPARC-NEXT: add %i3, %i2, %i2 -; SPARC-NEXT: umul %i0, %l1, %i3 +; SPARC-NEXT: umul %i0, %g3, %i3 ; SPARC-NEXT: rd %y, %l6 -; SPARC-NEXT: add %o0, %i2, %o1 -; SPARC-NEXT: umul %i1, %l1, %i1 -; SPARC-NEXT: rd %y, %i5 -; SPARC-NEXT: add %i3, %l4, %i2 -; SPARC-NEXT: add %i5, %i2, %l4 -; SPARC-NEXT: addcc %i1, %l5, %i1 -; SPARC-NEXT: umul %g2, %l1, %i3 +; SPARC-NEXT: add %o0, %i2, %o2 +; SPARC-NEXT: umul %i1, %g3, %i2 +; SPARC-NEXT: rd %y, %l0 +; SPARC-NEXT: add %i3, %l2, %i3 +; SPARC-NEXT: add %l0, %i3, %l2 +; SPARC-NEXT: addcc %i2, %l3, %l3 +; SPARC-NEXT: umul %g2, %g3, %i3 ; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %l4, %o1, %o4 -; SPARC-NEXT: umul %g3, %l1, %l1 +; SPARC-NEXT: addxcc %l2, %o2, %o4 +; SPARC-NEXT: umul %g4, %g3, %g3 ; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addcc %l1, %i2, %i2 -; SPARC-NEXT: addxcc %l5, 0, %l1 -; SPARC-NEXT: umul %g2, %g4, %g2 +; SPARC-NEXT: addcc %g3, %i2, %i2 +; SPARC-NEXT: addxcc %l5, 0, %g3 +; SPARC-NEXT: umul %g2, %l4, %g2 ; SPARC-NEXT: rd %y, %l5 ; SPARC-NEXT: addcc %g2, %i2, %i2 ; SPARC-NEXT: addxcc %l5, 0, %g2 -; SPARC-NEXT: addcc %l1, %g2, %g2 -; SPARC-NEXT: addxcc %g0, 0, %l1 -; SPARC-NEXT: umul %g3, %g4, %l5 +; SPARC-NEXT: addcc %g3, %g2, %g2 +; SPARC-NEXT: addxcc %g0, 0, %g3 +; SPARC-NEXT: umul %g4, %l4, %l5 ; SPARC-NEXT: rd %y, %o3 ; SPARC-NEXT: addcc %l5, %g2, %l5 -; SPARC-NEXT: addxcc %o3, %l1, %o3 -; SPARC-NEXT: addcc %l5, %i1, %i1 -; SPARC-NEXT: addxcc %o3, %o4, %g2 -; SPARC-NEXT: mov 1, %l1 -; SPARC-NEXT: cmp %g2, %o3 -; SPARC-NEXT: bcs .LBB0_4 -; SPARC-NEXT: mov %l1, %o4 -; SPARC-NEXT: ! %bb.3: ! %overflow +; SPARC-NEXT: addxcc %o3, %g3, %o3 +; SPARC-NEXT: addcc %l5, %l3, %g2 +; SPARC-NEXT: addxcc %o3, %o4, %g3 +; SPARC-NEXT: mov 1, %l3 +; SPARC-NEXT: cmp %g3, %o3 +; SPARC-NEXT: bcs .LBB0_2 +; SPARC-NEXT: mov %l3, %o4 +; SPARC-NEXT: ! %bb.1: ! 
%start ; SPARC-NEXT: mov %g0, %o4 -; SPARC-NEXT: .LBB0_4: ! %overflow -; SPARC-NEXT: cmp %i1, %l5 -; SPARC-NEXT: bcs .LBB0_6 -; SPARC-NEXT: mov %l1, %l5 -; SPARC-NEXT: ! %bb.5: ! %overflow +; SPARC-NEXT: .LBB0_2: ! %start +; SPARC-NEXT: cmp %g2, %l5 +; SPARC-NEXT: bcs .LBB0_4 +; SPARC-NEXT: mov %l3, %l5 +; SPARC-NEXT: ! %bb.3: ! %start ; SPARC-NEXT: mov %g0, %l5 -; SPARC-NEXT: .LBB0_6: ! %overflow -; SPARC-NEXT: cmp %g2, %o3 -; SPARC-NEXT: be .LBB0_8 +; SPARC-NEXT: .LBB0_4: ! %start +; SPARC-NEXT: cmp %g3, %o3 +; SPARC-NEXT: be .LBB0_6 ; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.7: ! %overflow +; SPARC-NEXT: ! %bb.5: ! %start ; SPARC-NEXT: mov %o4, %l5 -; SPARC-NEXT: .LBB0_8: ! %overflow +; SPARC-NEXT: .LBB0_6: ! %start +; SPARC-NEXT: cmp %g4, 0 +; SPARC-NEXT: bne .LBB0_8 +; SPARC-NEXT: mov %l3, %o3 +; SPARC-NEXT: ! %bb.7: ! %start +; SPARC-NEXT: mov %g0, %o3 +; SPARC-NEXT: .LBB0_8: ! %start ; SPARC-NEXT: cmp %i4, 0 ; SPARC-NEXT: bne .LBB0_10 -; SPARC-NEXT: mov %l1, %o3 -; SPARC-NEXT: ! %bb.9: ! %overflow -; SPARC-NEXT: mov %g0, %o3 -; SPARC-NEXT: .LBB0_10: ! %overflow -; SPARC-NEXT: cmp %g3, 0 -; SPARC-NEXT: bne .LBB0_12 -; SPARC-NEXT: mov %l1, %o4 -; SPARC-NEXT: ! %bb.11: ! %overflow +; SPARC-NEXT: mov %l3, %o4 +; SPARC-NEXT: ! %bb.9: ! %start ; SPARC-NEXT: mov %g0, %o4 -; SPARC-NEXT: .LBB0_12: ! %overflow -; SPARC-NEXT: cmp %o2, 0 -; SPARC-NEXT: bne .LBB0_14 -; SPARC-NEXT: mov %l1, %o2 -; SPARC-NEXT: ! %bb.13: ! %overflow -; SPARC-NEXT: mov %g0, %o2 -; SPARC-NEXT: .LBB0_14: ! %overflow +; SPARC-NEXT: .LBB0_10: ! %start +; SPARC-NEXT: cmp %o1, 0 +; SPARC-NEXT: bne .LBB0_12 +; SPARC-NEXT: mov %l3, %o1 +; SPARC-NEXT: ! %bb.11: ! %start +; SPARC-NEXT: mov %g0, %o1 +; SPARC-NEXT: .LBB0_12: ! %start ; SPARC-NEXT: cmp %l7, 0 -; SPARC-NEXT: bne .LBB0_16 -; SPARC-NEXT: mov %l1, %g3 -; SPARC-NEXT: ! %bb.15: ! %overflow -; SPARC-NEXT: mov %g0, %g3 -; SPARC-NEXT: .LBB0_16: ! %overflow -; SPARC-NEXT: cmp %o1, %o0 -; SPARC-NEXT: bcs .LBB0_18 -; SPARC-NEXT: mov %l1, %i4 -; SPARC-NEXT: ! %bb.17: ! %overflow -; SPARC-NEXT: mov %g0, %i4 -; SPARC-NEXT: .LBB0_18: ! %overflow +; SPARC-NEXT: bne .LBB0_14 +; SPARC-NEXT: mov %l3, %l7 +; SPARC-NEXT: ! %bb.13: ! %start +; SPARC-NEXT: mov %g0, %l7 +; SPARC-NEXT: .LBB0_14: ! %start +; SPARC-NEXT: cmp %o2, %o0 +; SPARC-NEXT: bcs .LBB0_16 +; SPARC-NEXT: mov %l3, %g4 +; SPARC-NEXT: ! %bb.15: ! %start +; SPARC-NEXT: mov %g0, %g4 +; SPARC-NEXT: .LBB0_16: ! %start +; SPARC-NEXT: cmp %l4, 0 +; SPARC-NEXT: bne .LBB0_18 +; SPARC-NEXT: mov %l3, %l4 +; SPARC-NEXT: ! %bb.17: ! %start +; SPARC-NEXT: mov %g0, %l4 +; SPARC-NEXT: .LBB0_18: ! %start ; SPARC-NEXT: cmp %i0, 0 ; SPARC-NEXT: bne .LBB0_20 -; SPARC-NEXT: mov %l1, %i0 -; SPARC-NEXT: ! %bb.19: ! %overflow -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: .LBB0_20: ! %overflow -; SPARC-NEXT: cmp %g4, 0 -; SPARC-NEXT: bne .LBB0_22 -; SPARC-NEXT: mov %l1, %l7 -; SPARC-NEXT: ! %bb.21: ! %overflow -; SPARC-NEXT: mov %g0, %l7 -; SPARC-NEXT: .LBB0_22: ! %overflow +; SPARC-NEXT: mov %l3, %o0 +; SPARC-NEXT: ! %bb.19: ! %start +; SPARC-NEXT: mov %g0, %o0 +; SPARC-NEXT: .LBB0_20: ! %start ; SPARC-NEXT: cmp %l6, 0 +; SPARC-NEXT: bne .LBB0_22 +; SPARC-NEXT: mov %l3, %l6 +; SPARC-NEXT: ! %bb.21: ! %start +; SPARC-NEXT: mov %g0, %l6 +; SPARC-NEXT: .LBB0_22: ! %start +; SPARC-NEXT: and %o4, %o3, %o2 +; SPARC-NEXT: cmp %l1, 0 +; SPARC-NEXT: and %o0, %l4, %l4 ; SPARC-NEXT: bne .LBB0_24 -; SPARC-NEXT: mov %l1, %g4 -; SPARC-NEXT: ! %bb.23: ! %overflow -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: .LBB0_24: ! 
%overflow -; SPARC-NEXT: and %o3, %o4, %l6 -; SPARC-NEXT: cmp %l3, 0 -; SPARC-NEXT: and %i0, %l7, %l7 -; SPARC-NEXT: bne .LBB0_26 -; SPARC-NEXT: mov %l1, %i0 -; SPARC-NEXT: ! %bb.25: ! %overflow -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: .LBB0_26: ! %overflow -; SPARC-NEXT: or %l6, %o2, %l3 -; SPARC-NEXT: cmp %l4, %i5 -; SPARC-NEXT: or %l7, %g4, %g4 -; SPARC-NEXT: bcs .LBB0_28 -; SPARC-NEXT: mov %l1, %i5 -; SPARC-NEXT: ! %bb.27: ! %overflow -; SPARC-NEXT: mov %g0, %i5 -; SPARC-NEXT: .LBB0_28: ! %overflow -; SPARC-NEXT: or %l3, %g3, %g3 -; SPARC-NEXT: cmp %l2, 0 -; SPARC-NEXT: or %g4, %i0, %g4 -; SPARC-NEXT: bne .LBB0_30 -; SPARC-NEXT: mov %l1, %i0 -; SPARC-NEXT: ! %bb.29: ! %overflow -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: .LBB0_30: ! %overflow -; SPARC-NEXT: or %g3, %i4, %i4 -; SPARC-NEXT: cmp %l0, 0 -; SPARC-NEXT: bne .LBB0_32 -; SPARC-NEXT: or %g4, %i5, %i5 -; SPARC-NEXT: ! %bb.31: ! %overflow +; SPARC-NEXT: mov %l3, %l1 +; SPARC-NEXT: ! %bb.23: ! %start ; SPARC-NEXT: mov %g0, %l1 -; SPARC-NEXT: .LBB0_32: ! %overflow -; SPARC-NEXT: and %l1, %i0, %i0 -; SPARC-NEXT: or %i0, %i5, %i0 -; SPARC-NEXT: or %i0, %i4, %i0 -; SPARC-NEXT: ba .LBB0_49 -; SPARC-NEXT: or %i0, %l5, %i0 -; SPARC-NEXT: .LBB0_33: ! %overflow.no.lhs -; SPARC-NEXT: or %i5, %i4, %i2 -; SPARC-NEXT: cmp %i2, 0 -; SPARC-NEXT: be .LBB0_48 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.34: ! %overflow.no.lhs.only -; SPARC-NEXT: umul %g3, %l1, %i2 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: umul %g2, %l1, %i3 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: addcc %i2, %l2, %i2 -; SPARC-NEXT: addxcc %l0, 0, %l0 -; SPARC-NEXT: umul %g2, %g4, %l2 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %l2, %i2, %i2 -; SPARC-NEXT: addxcc %l3, 0, %l2 -; SPARC-NEXT: addcc %l0, %l2, %l0 -; SPARC-NEXT: addxcc %g0, 0, %l2 -; SPARC-NEXT: umul %g3, %g4, %l3 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %l3, %l0, %l0 -; SPARC-NEXT: smul %l1, %i0, %l3 -; SPARC-NEXT: umul %l1, %i1, %l1 -; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l4, %l2, %l2 -; SPARC-NEXT: add %l5, %l3, %l3 -; SPARC-NEXT: smul %g4, %i1, %g4 -; SPARC-NEXT: add %l3, %g4, %g4 -; SPARC-NEXT: addcc %l0, %l1, %l0 -; SPARC-NEXT: umul %g2, %i5, %l1 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addxcc %l2, %g4, %g4 -; SPARC-NEXT: umul %g3, %i5, %l2 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %l2, %l3, %l2 -; SPARC-NEXT: addxcc %l4, 0, %l3 -; SPARC-NEXT: umul %g2, %i4, %g2 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %g2, %l2, %g2 -; SPARC-NEXT: addxcc %l4, 0, %l2 -; SPARC-NEXT: addcc %l3, %l2, %l2 -; SPARC-NEXT: addxcc %g0, 0, %l3 -; SPARC-NEXT: umul %g3, %i4, %g3 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %g3, %l2, %g3 -; SPARC-NEXT: smul %i5, %i0, %i0 -; SPARC-NEXT: umul %i5, %i1, %i5 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: addxcc %l4, %l3, %l3 -; SPARC-NEXT: add %l2, %i0, %i0 -; SPARC-NEXT: smul %i4, %i1, %i1 -; SPARC-NEXT: add %i0, %i1, %i0 -; SPARC-NEXT: addcc %g3, %i5, %i4 -; SPARC-NEXT: addxcc %l3, %i0, %i5 -; SPARC-NEXT: addcc %l0, %l1, %i1 -; SPARC-NEXT: addxcc %g4, %g2, %g2 -; SPARC-NEXT: mov 1, %i0 -; SPARC-NEXT: cmp %g2, %g4 -; SPARC-NEXT: bcs .LBB0_36 -; SPARC-NEXT: mov %i0, %g3 -; SPARC-NEXT: ! %bb.35: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %g3 -; SPARC-NEXT: .LBB0_36: ! %overflow.no.lhs.only -; SPARC-NEXT: cmp %i1, %l0 -; SPARC-NEXT: bcs .LBB0_38 -; SPARC-NEXT: mov %i0, %l0 -; SPARC-NEXT: ! %bb.37: ! %overflow.no.lhs.only -; SPARC-NEXT: mov %g0, %l0 -; SPARC-NEXT: .LBB0_38: ! 
%overflow.no.lhs.only -; SPARC-NEXT: cmp %g2, %g4 -; SPARC-NEXT: be .LBB0_46 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.39: ! %overflow.no.lhs.only -; SPARC-NEXT: ba .LBB0_46 -; SPARC-NEXT: mov %g3, %l0 -; SPARC-NEXT: .LBB0_40: ! %overflow.no.rhs.only -; SPARC-NEXT: umul %g4, %g2, %i2 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: umul %l1, %g2, %i3 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: addcc %i2, %l2, %i2 -; SPARC-NEXT: addxcc %l0, 0, %l0 -; SPARC-NEXT: umul %l1, %g3, %l2 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addcc %l2, %i2, %i2 -; SPARC-NEXT: addxcc %l3, 0, %l2 -; SPARC-NEXT: addcc %l0, %l2, %l0 -; SPARC-NEXT: addxcc %g0, 0, %l2 -; SPARC-NEXT: umul %g4, %g3, %l3 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %l3, %l0, %l0 -; SPARC-NEXT: smul %g2, %i4, %l3 -; SPARC-NEXT: umul %g2, %i5, %g2 -; SPARC-NEXT: rd %y, %l5 -; SPARC-NEXT: addxcc %l4, %l2, %l2 -; SPARC-NEXT: add %l5, %l3, %l3 -; SPARC-NEXT: smul %g3, %i5, %g3 -; SPARC-NEXT: add %l3, %g3, %g3 -; SPARC-NEXT: addcc %l0, %g2, %l0 -; SPARC-NEXT: umul %l1, %i1, %g2 -; SPARC-NEXT: rd %y, %l3 -; SPARC-NEXT: addxcc %l2, %g3, %g3 -; SPARC-NEXT: umul %g4, %i1, %l2 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %l2, %l3, %l2 -; SPARC-NEXT: addxcc %l4, 0, %l3 -; SPARC-NEXT: umul %l1, %i0, %l1 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %l1, %l2, %l1 -; SPARC-NEXT: addxcc %l4, 0, %l2 -; SPARC-NEXT: addcc %l3, %l2, %l2 -; SPARC-NEXT: addxcc %g0, 0, %l3 -; SPARC-NEXT: umul %g4, %i0, %g4 -; SPARC-NEXT: rd %y, %l4 -; SPARC-NEXT: addcc %g4, %l2, %g4 -; SPARC-NEXT: smul %i1, %i4, %i4 -; SPARC-NEXT: umul %i1, %i5, %i1 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: addxcc %l4, %l3, %l3 -; SPARC-NEXT: add %l2, %i4, %i4 -; SPARC-NEXT: smul %i0, %i5, %i0 -; SPARC-NEXT: add %i4, %i0, %i0 -; SPARC-NEXT: addcc %g4, %i1, %i4 -; SPARC-NEXT: addxcc %l3, %i0, %i5 -; SPARC-NEXT: addcc %l0, %g2, %i1 -; SPARC-NEXT: addxcc %g3, %l1, %g2 -; SPARC-NEXT: mov 1, %i0 -; SPARC-NEXT: cmp %g2, %g3 -; SPARC-NEXT: bcs .LBB0_42 -; SPARC-NEXT: mov %i0, %g4 -; SPARC-NEXT: ! %bb.41: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g0, %g4 -; SPARC-NEXT: .LBB0_42: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %i1, %l0 -; SPARC-NEXT: bcs .LBB0_44 -; SPARC-NEXT: mov %i0, %l0 -; SPARC-NEXT: ! %bb.43: ! %overflow.no.rhs.only +; SPARC-NEXT: .LBB0_24: ! %start +; SPARC-NEXT: or %o2, %o1, %o0 +; SPARC-NEXT: cmp %l2, %l0 +; SPARC-NEXT: or %l4, %l6, %l4 +; SPARC-NEXT: bcs .LBB0_26 +; SPARC-NEXT: mov %l3, %l0 +; SPARC-NEXT: ! %bb.25: ! %start ; SPARC-NEXT: mov %g0, %l0 -; SPARC-NEXT: .LBB0_44: ! %overflow.no.rhs.only -; SPARC-NEXT: cmp %g2, %g3 -; SPARC-NEXT: be .LBB0_46 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.45: ! %overflow.no.rhs.only -; SPARC-NEXT: mov %g4, %l0 -; SPARC-NEXT: .LBB0_46: ! %overflow.no.rhs.only -; SPARC-NEXT: addcc %i4, %l0, %i4 -; SPARC-NEXT: addxcc %i5, 0, %i5 -; SPARC-NEXT: or %i4, %i5, %i4 +; SPARC-NEXT: .LBB0_26: ! %start +; SPARC-NEXT: or %o0, %l7, %l2 +; SPARC-NEXT: or %i5, %i4, %i4 ; SPARC-NEXT: cmp %i4, 0 -; SPARC-NEXT: bne .LBB0_49 -; SPARC-NEXT: nop -; SPARC-NEXT: ! %bb.47: ! %overflow.no.rhs.only -; SPARC-NEXT: ba .LBB0_49 -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: .LBB0_48: ! 
%overflow.no -; SPARC-NEXT: smul %l1, %i0, %i3 -; SPARC-NEXT: umul %l1, %i1, %i2 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: mov %g0, %i0 -; SPARC-NEXT: add %l0, %i3, %i3 -; SPARC-NEXT: smul %g4, %i1, %i1 -; SPARC-NEXT: smul %i5, %g3, %l0 -; SPARC-NEXT: umul %i5, %g2, %i5 -; SPARC-NEXT: rd %y, %l2 -; SPARC-NEXT: add %i3, %i1, %i1 -; SPARC-NEXT: add %l2, %l0, %i3 -; SPARC-NEXT: smul %i4, %g2, %i4 -; SPARC-NEXT: add %i3, %i4, %i4 -; SPARC-NEXT: addcc %i5, %i2, %i5 -; SPARC-NEXT: umul %g2, %l1, %i3 -; SPARC-NEXT: rd %y, %i2 -; SPARC-NEXT: addxcc %i4, %i1, %i4 -; SPARC-NEXT: umul %g3, %l1, %i1 -; SPARC-NEXT: rd %y, %l0 -; SPARC-NEXT: addcc %i1, %i2, %i1 -; SPARC-NEXT: addxcc %l0, 0, %l0 -; SPARC-NEXT: umul %g2, %g4, %i2 -; SPARC-NEXT: rd %y, %g2 -; SPARC-NEXT: addcc %i2, %i1, %i2 -; SPARC-NEXT: addxcc %g2, 0, %i1 -; SPARC-NEXT: addcc %l0, %i1, %i1 -; SPARC-NEXT: addxcc %g0, 0, %g2 -; SPARC-NEXT: umul %g3, %g4, %g3 -; SPARC-NEXT: rd %y, %g4 -; SPARC-NEXT: addcc %g3, %i1, %i1 -; SPARC-NEXT: addxcc %g4, %g2, %g2 -; SPARC-NEXT: addcc %i1, %i5, %i1 -; SPARC-NEXT: addxcc %g2, %i4, %g2 -; SPARC-NEXT: .LBB0_49: ! %overflow.res +; SPARC-NEXT: or %l4, %l1, %l1 +; SPARC-NEXT: bne .LBB0_28 +; SPARC-NEXT: mov %l3, %i4 +; SPARC-NEXT: ! %bb.27: ! %start +; SPARC-NEXT: mov %g0, %i4 +; SPARC-NEXT: .LBB0_28: ! %start +; SPARC-NEXT: or %l2, %g4, %i5 +; SPARC-NEXT: or %i1, %i0, %i0 +; SPARC-NEXT: cmp %i0, 0 +; SPARC-NEXT: bne .LBB0_30 +; SPARC-NEXT: or %l1, %l0, %i0 +; SPARC-NEXT: ! %bb.29: ! %start +; SPARC-NEXT: mov %g0, %l3 +; SPARC-NEXT: .LBB0_30: ! %start +; SPARC-NEXT: and %l3, %i4, %i1 +; SPARC-NEXT: or %i1, %i0, %i0 +; SPARC-NEXT: or %i0, %i5, %i0 +; SPARC-NEXT: or %i0, %l5, %i0 ; SPARC-NEXT: and %i0, 1, %i4 +; SPARC-NEXT: mov %g3, %i0 ; SPARC-NEXT: ret -; SPARC-NEXT: restore %g0, %g2, %o0 +; SPARC-NEXT: restore %g0, %g2, %o1 ; ; SPARC64-LABEL: muloti_test: ; SPARC64: .register %g2, #scratch ; SPARC64-NEXT: .register %g3, #scratch -; SPARC64-NEXT: ! %bb.0: ! %overflow.entry +; SPARC64-NEXT: ! %bb.0: ! %start ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: brz %i0, .LBB0_3 -; SPARC64-NEXT: mov %i1, %i4 -; SPARC64-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC64-NEXT: brz %i2, .LBB0_5 -; SPARC64-NEXT: nop -; SPARC64-NEXT: ! %bb.2: ! 
%overflow +; SPARC64-NEXT: mov %i0, %l1 ; SPARC64-NEXT: mov %g0, %o0 ; SPARC64-NEXT: mov %i2, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 -; SPARC64-NEXT: mov %o0, %i5 -; SPARC64-NEXT: mov %o1, %i1 +; SPARC64-NEXT: mov %i1, %o3 +; SPARC64-NEXT: mov %o0, %i4 +; SPARC64-NEXT: mov %o1, %i5 ; SPARC64-NEXT: mov %g0, %o0 ; SPARC64-NEXT: mov %i0, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i3, %o3 ; SPARC64-NEXT: mov %o0, %l0 -; SPARC64-NEXT: add %o1, %i1, %l1 +; SPARC64-NEXT: add %o1, %i5, %i0 ; SPARC64-NEXT: mov %g0, %o0 -; SPARC64-NEXT: mov %i4, %o1 +; SPARC64-NEXT: mov %i1, %o1 ; SPARC64-NEXT: mov %g0, %o2 ; SPARC64-NEXT: call __multi3 ; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %g0, %i4 +; SPARC64-NEXT: mov %g0, %i1 +; SPARC64-NEXT: mov %g0, %i3 +; SPARC64-NEXT: mov %g0, %i5 ; SPARC64-NEXT: mov %g0, %g2 ; SPARC64-NEXT: mov %g0, %g3 -; SPARC64-NEXT: mov %g0, %g4 -; SPARC64-NEXT: mov %g0, %g5 -; SPARC64-NEXT: add %o0, %l1, %i3 -; SPARC64-NEXT: cmp %i3, %o0 -; SPARC64-NEXT: movrnz %i2, 1, %g2 -; SPARC64-NEXT: movrnz %i0, 1, %g3 -; SPARC64-NEXT: and %g3, %g2, %i0 -; SPARC64-NEXT: movcs %xcc, 1, %i4 -; SPARC64-NEXT: movrnz %l0, 1, %g4 -; SPARC64-NEXT: or %i0, %g4, %i0 -; SPARC64-NEXT: movrnz %i5, 1, %g5 -; SPARC64-NEXT: or %i0, %g5, %i0 -; SPARC64-NEXT: ba .LBB0_8 -; SPARC64-NEXT: or %i0, %i4, %i0 -; SPARC64-NEXT: .LBB0_3: ! %overflow.no.lhs -; SPARC64-NEXT: brz %i2, .LBB0_7 -; SPARC64-NEXT: nop -; SPARC64-NEXT: ! %bb.4: ! %overflow.no.lhs.only -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %o0, %i5 -; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i2, %o3 -; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: mov %g0, %i0 -; SPARC64-NEXT: add %i5, %o1, %i3 -; SPARC64-NEXT: ba .LBB0_6 -; SPARC64-NEXT: cmp %i3, %i5 -; SPARC64-NEXT: .LBB0_5: ! %overflow.no.rhs.only -; SPARC64-NEXT: mov %i2, %o0 -; SPARC64-NEXT: mov %i3, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i4, %o3 -; SPARC64-NEXT: mov %o0, %i4 -; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %i2, %o0 -; SPARC64-NEXT: mov %i3, %o1 -; SPARC64-NEXT: mov %g0, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i0, %o3 -; SPARC64-NEXT: mov %g0, %i2 -; SPARC64-NEXT: mov %g0, %i0 -; SPARC64-NEXT: add %i4, %o1, %i3 -; SPARC64-NEXT: cmp %i3, %i4 -; SPARC64-NEXT: .LBB0_6: ! %overflow.res -; SPARC64-NEXT: movcs %xcc, 1, %i2 -; SPARC64-NEXT: srl %i2, 0, %i2 -; SPARC64-NEXT: add %o0, %i2, %i2 -; SPARC64-NEXT: ba .LBB0_8 -; SPARC64-NEXT: movrnz %i2, 1, %i0 -; SPARC64-NEXT: .LBB0_7: ! %overflow.no -; SPARC64-NEXT: mov %i0, %o0 -; SPARC64-NEXT: mov %i4, %o1 -; SPARC64-NEXT: mov %i2, %o2 -; SPARC64-NEXT: call __multi3 -; SPARC64-NEXT: mov %i3, %o3 -; SPARC64-NEXT: mov %o0, %i3 -; SPARC64-NEXT: mov %o1, %i1 -; SPARC64-NEXT: mov %g0, %i0 -; SPARC64-NEXT: .LBB0_8: ! 
%overflow.res -; SPARC64-NEXT: and %i0, 1, %i2 +; SPARC64-NEXT: add %o0, %i0, %i0 +; SPARC64-NEXT: cmp %i0, %o0 +; SPARC64-NEXT: movrnz %l0, 1, %i3 +; SPARC64-NEXT: movrnz %i2, 1, %i5 +; SPARC64-NEXT: movrnz %l1, 1, %g2 +; SPARC64-NEXT: movcs %xcc, 1, %i1 +; SPARC64-NEXT: and %g2, %i5, %i2 +; SPARC64-NEXT: or %i2, %i3, %i2 +; SPARC64-NEXT: movrnz %i4, 1, %g3 +; SPARC64-NEXT: or %i2, %g3, %i2 +; SPARC64-NEXT: or %i2, %i1, %i1 +; SPARC64-NEXT: srl %i1, 0, %i2 ; SPARC64-NEXT: ret -; SPARC64-NEXT: restore %g0, %i3, %o0 +; SPARC64-NEXT: restore %g0, %o1, %o1 ; ; SPARC64-VIS3-LABEL: muloti_test: ; SPARC64-VIS3: .register %g2, #scratch ; SPARC64-VIS3-NEXT: .register %g3, #scratch -; SPARC64-VIS3-NEXT: ! %bb.0: ! %overflow.entry +; SPARC64-VIS3-NEXT: ! %bb.0: ! %start ; SPARC64-VIS3-NEXT: save %sp, -128, %sp -; SPARC64-VIS3-NEXT: brz %i0, .LBB0_3 -; SPARC64-VIS3-NEXT: nop -; SPARC64-VIS3-NEXT: ! %bb.1: ! %overflow.lhs -; SPARC64-VIS3-NEXT: brz %i2, .LBB0_5 -; SPARC64-VIS3-NEXT: nop -; SPARC64-VIS3-NEXT: ! %bb.2: ! %overflow ; SPARC64-VIS3-NEXT: mov %g0, %i5 ; SPARC64-VIS3-NEXT: mov %g0, %g2 ; SPARC64-VIS3-NEXT: mov %g0, %g3 @@ -490,59 +227,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) nounwind { ; SPARC64-VIS3-NEXT: umulxhi %i2, %i1, %i2 ; SPARC64-VIS3-NEXT: movrnz %i2, 1, %g5 ; SPARC64-VIS3-NEXT: or %i0, %g5, %i0 -; SPARC64-VIS3-NEXT: ba .LBB0_7 -; SPARC64-VIS3-NEXT: or %i0, %i5, %i5 -; SPARC64-VIS3-NEXT: .LBB0_3: ! %overflow.no.lhs -; SPARC64-VIS3-NEXT: brz %i2, .LBB0_6 -; SPARC64-VIS3-NEXT: nop -; SPARC64-VIS3-NEXT: ! %bb.4: ! %overflow.no.lhs.only -; SPARC64-VIS3-NEXT: mov %g0, %g2 -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i4 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %g3 -; SPARC64-VIS3-NEXT: add %g3, %i4, %g3 -; SPARC64-VIS3-NEXT: mulx %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i2, %i4 -; SPARC64-VIS3-NEXT: add %i4, %i0, %i0 -; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i3 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 -; SPARC64-VIS3-NEXT: mov %i3, %i1 -; SPARC64-VIS3-NEXT: add %g3, %i2, %i4 -; SPARC64-VIS3-NEXT: cmp %i4, %g3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g2 -; SPARC64-VIS3-NEXT: srl %g2, 0, %i2 -; SPARC64-VIS3-NEXT: add %i0, %i2, %i0 -; SPARC64-VIS3-NEXT: ba .LBB0_8 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 -; SPARC64-VIS3-NEXT: .LBB0_5: ! %overflow.no.rhs.only -; SPARC64-VIS3-NEXT: mov %g0, %g2 -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mulx %i2, %i1, %i4 -; SPARC64-VIS3-NEXT: umulxhi %i3, %i1, %g3 -; SPARC64-VIS3-NEXT: add %g3, %i4, %g3 -; SPARC64-VIS3-NEXT: mulx %i2, %i0, %i2 -; SPARC64-VIS3-NEXT: umulxhi %i3, %i0, %i4 -; SPARC64-VIS3-NEXT: add %i4, %i2, %i2 -; SPARC64-VIS3-NEXT: mulx %i3, %i1, %i1 -; SPARC64-VIS3-NEXT: mulx %i3, %i0, %i0 -; SPARC64-VIS3-NEXT: add %g3, %i0, %i4 -; SPARC64-VIS3-NEXT: cmp %i4, %g3 -; SPARC64-VIS3-NEXT: movcs %xcc, 1, %g2 -; SPARC64-VIS3-NEXT: srl %g2, 0, %i0 -; SPARC64-VIS3-NEXT: add %i2, %i0, %i0 -; SPARC64-VIS3-NEXT: ba .LBB0_8 -; SPARC64-VIS3-NEXT: movrnz %i0, 1, %i5 -; SPARC64-VIS3-NEXT: .LBB0_6: ! %overflow.no -; SPARC64-VIS3-NEXT: mov %g0, %i5 -; SPARC64-VIS3-NEXT: mulx %i1, %i2, %i2 -; SPARC64-VIS3-NEXT: umulxhi %i1, %i3, %i4 -; SPARC64-VIS3-NEXT: add %i4, %i2, %i2 -; SPARC64-VIS3-NEXT: mulx %i0, %i3, %i0 -; SPARC64-VIS3-NEXT: add %i2, %i0, %i4 -; SPARC64-VIS3-NEXT: .LBB0_7: ! %overflow.res +; SPARC64-VIS3-NEXT: or %i0, %i5, %i0 ; SPARC64-VIS3-NEXT: mulx %i1, %i3, %i1 -; SPARC64-VIS3-NEXT: .LBB0_8: ! 
%overflow.res -; SPARC64-VIS3-NEXT: and %i5, 1, %i2 +; SPARC64-VIS3-NEXT: srl %i0, 0, %i2 ; SPARC64-VIS3-NEXT: ret ; SPARC64-VIS3-NEXT: restore %g0, %i4, %o0 start: diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll index c19ce3f34011e..9b5fa1c2bc811 100644 --- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll @@ -3,568 +3,200 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV6-LABEL: muloti_test: -; THUMBV6: @ %bb.0: @ %overflow.entry +; THUMBV6: @ %bb.0: @ %start ; THUMBV6-NEXT: .save {r4, r5, r6, r7, lr} ; THUMBV6-NEXT: push {r4, r5, r6, r7, lr} -; THUMBV6-NEXT: .pad #84 -; THUMBV6-NEXT: sub sp, #84 +; THUMBV6-NEXT: .pad #60 +; THUMBV6-NEXT: sub sp, #60 ; THUMBV6-NEXT: mov r6, r3 -; THUMBV6-NEXT: str r0, [sp, #48] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #108] -; THUMBV6-NEXT: ldr r5, [sp, #104] -; THUMBV6-NEXT: str r5, [sp, #56] @ 4-byte Spill -; THUMBV6-NEXT: str r0, [sp, #52] @ 4-byte Spill -; THUMBV6-NEXT: orrs r5, r0 -; THUMBV6-NEXT: ldr r1, [sp, #124] -; THUMBV6-NEXT: ldr r4, [sp, #120] -; THUMBV6-NEXT: ldr r0, [sp, #116] -; THUMBV6-NEXT: str r0, [sp, #68] @ 4-byte Spill -; THUMBV6-NEXT: ldr r3, [sp, #112] -; THUMBV6-NEXT: str r4, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #60] @ 4-byte Spill -; THUMBV6-NEXT: str r2, [sp, #72] @ 4-byte Spill -; THUMBV6-NEXT: str r6, [sp, #76] @ 4-byte Spill -; THUMBV6-NEXT: str r3, [sp, #64] @ 4-byte Spill -; THUMBV6-NEXT: bne .LBB0_1 -; THUMBV6-NEXT: b .LBB0_3 -; THUMBV6-NEXT: .LBB0_1: @ %overflow.lhs -; THUMBV6-NEXT: orrs r4, r1 -; THUMBV6-NEXT: bne .LBB0_2 -; THUMBV6-NEXT: b .LBB0_5 -; THUMBV6-NEXT: .LBB0_2: @ %overflow -; THUMBV6-NEXT: str r4, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: movs r4, #0 -; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r7, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #32] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r6, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 +; THUMBV6-NEXT: mov r1, r2 +; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill +; THUMBV6-NEXT: mov r4, r0 ; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: ldr r2, [sp, #88] +; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill +; THUMBV6-NEXT: movs r5, #0 +; THUMBV6-NEXT: mov r0, r1 +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r1, r0 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: adcs r1, r4 ; THUMBV6-NEXT: str r1, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r7, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, 
[sp, #12] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #20] @ 4-byte Spill -; THUMBV6-NEXT: str r5, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: ldr r5, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: mov r0, r5 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r6, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r1, [sp, #16] @ 4-byte Spill -; THUMBV6-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r7, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; THUMBV6-NEXT: adds r2, r1, r2 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: str r1, [sp, #56] @ 4-byte Spill -; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adcs r2, r0 -; THUMBV6-NEXT: str r2, [sp, #8] @ 4-byte Spill -; THUMBV6-NEXT: ldr r6, [sp, #72] @ 4-byte Reload +; THUMBV6-NEXT: str r0, [r4] +; THUMBV6-NEXT: ldr r2, [sp, #96] +; THUMBV6-NEXT: str r2, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: mov r4, r6 +; THUMBV6-NEXT: str r6, [sp, #56] @ 4-byte Spill ; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: mov r7, r1 -; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r6, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: adds r0, r7, r1 -; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV6-NEXT: mov r7, r4 -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: subs r0, r1, #1 +; THUMBV6-NEXT: sbcs r7, r0 +; THUMBV6-NEXT: ldr r0, [sp, #100] +; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: ldr r6, [sp, #52] @ 4-byte Reload ; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r7 -; THUMBV6-NEXT: str r1, [sp] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r6 -; THUMBV6-NEXT: mov r2, r4 -; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r0 -; THUMBV6-NEXT: str r1, [sp, #64] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: ldr r7, [sp, #76] @ 4-byte 
Reload -; THUMBV6-NEXT: mov r1, r7 -; THUMBV6-NEXT: mov r2, r4 +; THUMBV6-NEXT: str r0, [sp, #24] @ 4-byte Spill +; THUMBV6-NEXT: subs r2, r1, #1 +; THUMBV6-NEXT: sbcs r1, r2 +; THUMBV6-NEXT: subs r2, r4, #1 ; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: ldr r2, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #4] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: ldr r2, [sp] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #72] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r0 -; THUMBV6-NEXT: adcs r4, r4 -; THUMBV6-NEXT: ldr r3, [sp, #32] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r3, #1 -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: ldr r0, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r0, #1 -; THUMBV6-NEXT: sbcs r0, r2 -; THUMBV6-NEXT: subs r2, r7, #1 -; THUMBV6-NEXT: sbcs r7, r2 -; THUMBV6-NEXT: mov r6, r7 -; THUMBV6-NEXT: ldr r7, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r7, #1 -; THUMBV6-NEXT: sbcs r7, r2 -; THUMBV6-NEXT: ands r7, r6 -; THUMBV6-NEXT: orrs r7, r0 -; THUMBV6-NEXT: orrs r7, r3 -; THUMBV6-NEXT: ldr r0, [sp, #28] @ 4-byte Reload -; THUMBV6-NEXT: orrs r7, r0 -; THUMBV6-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r0, #1 -; THUMBV6-NEXT: sbcs r0, r2 -; THUMBV6-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r3, #1 -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: mov r6, r3 -; THUMBV6-NEXT: ldr r3, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r3, #1 -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: subs r2, r5, #1 -; THUMBV6-NEXT: sbcs r5, r2 -; THUMBV6-NEXT: ands r5, r3 -; THUMBV6-NEXT: orrs r5, r6 -; THUMBV6-NEXT: orrs r5, r0 -; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: orrs r5, r2 -; THUMBV6-NEXT: ldr r3, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r3, #1 ; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: mov r6, r3 -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: subs r2, r2, #1 -; THUMBV6-NEXT: ldr r3, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: sbcs r3, r2 -; THUMBV6-NEXT: str r3, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: ands r2, r6 -; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: orrs r2, r5 -; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: ldr r5, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: orrs r5, r7 -; THUMBV6-NEXT: orrs r5, r4 -; THUMBV6-NEXT: b .LBB0_8 -; THUMBV6-NEXT: .LBB0_3: @ %overflow.no.lhs -; THUMBV6-NEXT: mov r0, r4 -; THUMBV6-NEXT: orrs r0, r1 -; THUMBV6-NEXT: bne .LBB0_4 -; THUMBV6-NEXT: b .LBB0_7 -; THUMBV6-NEXT: .LBB0_4: @ %overflow.no.lhs.only -; THUMBV6-NEXT: mov r5, r4 -; THUMBV6-NEXT: movs r4, #0 -; THUMBV6-NEXT: mov r0, r2 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r7, r2 -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: str r5, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: 
ldr r1, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: adcs r6, r4 -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r5, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: adds r0, r6, r1 -; THUMBV6-NEXT: str r0, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: mov r7, r4 -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: ldr r1, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: adcs r6, r7 +; THUMBV6-NEXT: ldr r4, [sp, #32] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r4, #1 +; THUMBV6-NEXT: sbcs r4, r2 +; THUMBV6-NEXT: ands r4, r3 +; THUMBV6-NEXT: orrs r4, r1 +; THUMBV6-NEXT: orrs r4, r7 +; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; THUMBV6-NEXT: adds r7, r1, r0 ; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #60] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: ldr r7, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: ldr r5, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r2, r4 -; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: str r0, [sp, #24] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: ldr r2, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: adds r3, r0, r2 -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: adcs r2, r1 -; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r3 -; THUMBV6-NEXT: str r0, [sp, #60] @ 4-byte Spill -; THUMBV6-NEXT: adcs r2, r6 -; THUMBV6-NEXT: str r2, [sp, #80] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r7, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: mov r6, r1 +; THUMBV6-NEXT: adds r0, r1, r7 +; THUMBV6-NEXT: str r0, [sp, #20] @ 4-byte Spill ; THUMBV6-NEXT: mov r0, r5 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r7, r1 -; THUMBV6-NEXT: adds r6, r0, r6 -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r5, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: adds r0, r7, r1 -; THUMBV6-NEXT: str r0, [sp, #72] @ 4-byte Spill -; THUMBV6-NEXT: mov r7, r4 -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #76] @ 4-byte Reload -; 
THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: ldr r1, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #76] @ 4-byte Spill -; THUMBV6-NEXT: adcs r6, r7 -; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload +; THUMBV6-NEXT: adcs r0, r5 +; THUMBV6-NEXT: orrs r0, r4 +; THUMBV6-NEXT: str r0, [sp, #16] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #92] +; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill +; THUMBV6-NEXT: ldr r7, [sp, #80] ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: ldr r5, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: adcs r1, r6 -; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: b .LBB0_6 -; THUMBV6-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; THUMBV6-NEXT: movs r4, #0 -; THUMBV6-NEXT: mov r0, r3 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r7, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r5, r3 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 ; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: ldr r1, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r7, r0, r1 -; THUMBV6-NEXT: adcs r6, r4 -; THUMBV6-NEXT: mov r0, r5 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r5, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: adds r0, r0, r7 -; THUMBV6-NEXT: str r0, [sp, #32] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: adds r0, r6, r1 -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: ldr r6, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r7, r4 -; THUMBV6-NEXT: adcs r7, r4 +; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; THUMBV6-NEXT: mov r4, r1 +; THUMBV6-NEXT: subs r0, r1, #1 +; THUMBV6-NEXT: sbcs r4, r0 +; THUMBV6-NEXT: ldr r6, [sp, #84] ; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r5 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r7 -; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r2, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill -; THUMBV6-NEXT: mov r5, r1 -; THUMBV6-NEXT: ldr r7, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r6 -; THUMBV6-NEXT: mov r2, r4 -; THUMBV6-NEXT: mov r3, r4 +; 
THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #52] @ 4-byte Spill -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: adds r2, r0, r2 -; THUMBV6-NEXT: adcs r5, r1 -; THUMBV6-NEXT: ldr r0, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #56] @ 4-byte Spill -; THUMBV6-NEXT: ldr r0, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adcs r5, r0 +; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV6-NEXT: subs r2, r1, #1 +; THUMBV6-NEXT: sbcs r1, r2 +; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: subs r2, r3, #1 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: str r6, [sp, #8] @ 4-byte Spill +; THUMBV6-NEXT: subs r2, r6, #1 +; THUMBV6-NEXT: sbcs r6, r2 +; THUMBV6-NEXT: ands r6, r3 +; THUMBV6-NEXT: orrs r6, r1 +; THUMBV6-NEXT: orrs r6, r4 +; THUMBV6-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r1, r0 +; THUMBV6-NEXT: str r0, [sp, #4] @ 4-byte Spill ; THUMBV6-NEXT: mov r0, r7 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r7, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r6 -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r7 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r7, r1 -; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: ldr r6, [sp, #76] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: adds r0, r7, r1 -; THUMBV6-NEXT: str r0, [sp, #64] @ 4-byte Spill -; THUMBV6-NEXT: mov r7, r4 -; THUMBV6-NEXT: adcs r7, r4 -; THUMBV6-NEXT: ldr r0, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r4 -; THUMBV6-NEXT: mov r2, r6 -; THUMBV6-NEXT: mov r3, r4 -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: ldr r1, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #68] @ 4-byte Spill -; THUMBV6-NEXT: adcs r6, r7 -; THUMBV6-NEXT: add r2, sp, #72 -; THUMBV6-NEXT: ldm r2, {r0, r1, r2} @ 12-byte Folded Reload -; THUMBV6-NEXT: ldr r3, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: ldr r2, [sp, #52] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 -; THUMBV6-NEXT: adcs r1, r6 -; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload -; THUMBV6-NEXT: adcs r1, r2 -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: .LBB0_6: @ %overflow.res -; THUMBV6-NEXT: adcs r2, r4 -; THUMBV6-NEXT: adcs r5, r4 -; THUMBV6-NEXT: orrs r5, r2 -; THUMBV6-NEXT: subs r2, r5, #1 -; THUMBV6-NEXT: sbcs r5, r2 -; THUMBV6-NEXT: b .LBB0_8 -; 
THUMBV6-NEXT: .LBB0_7: @ %overflow.no -; THUMBV6-NEXT: movs r5, #0 -; THUMBV6-NEXT: mov r0, r2 ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r7, r2 -; THUMBV6-NEXT: mov r2, r3 -; THUMBV6-NEXT: mov r4, r3 +; THUMBV6-NEXT: ldr r4, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: str r0, [sp, #44] @ 4-byte Spill -; THUMBV6-NEXT: str r1, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: mov r0, r6 +; THUMBV6-NEXT: str r0, [sp, #12] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r1, r0 +; THUMBV6-NEXT: mov r1, r5 +; THUMBV6-NEXT: adcs r1, r5 +; THUMBV6-NEXT: orrs r1, r6 +; THUMBV6-NEXT: ldr r3, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; THUMBV6-NEXT: orrs r3, r2 +; THUMBV6-NEXT: subs r2, r3, #1 +; THUMBV6-NEXT: sbcs r3, r2 +; THUMBV6-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; THUMBV6-NEXT: orrs r7, r2 +; THUMBV6-NEXT: subs r2, r7, #1 +; THUMBV6-NEXT: sbcs r7, r2 +; THUMBV6-NEXT: ands r7, r3 +; THUMBV6-NEXT: orrs r7, r1 +; THUMBV6-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; THUMBV6-NEXT: orrs r7, r1 +; THUMBV6-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; THUMBV6-NEXT: adds r1, r2, r1 +; THUMBV6-NEXT: str r1, [sp, #32] @ 4-byte Spill +; THUMBV6-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; THUMBV6-NEXT: adcs r0, r1 +; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 ; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r4, r6 -; THUMBV6-NEXT: mov r6, r1 -; THUMBV6-NEXT: ldr r1, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill -; THUMBV6-NEXT: adcs r6, r5 -; THUMBV6-NEXT: mov r0, r7 +; THUMBV6-NEXT: mov r4, r1 +; THUMBV6-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adds r6, r0, r1 +; THUMBV6-NEXT: adcs r4, r5 +; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: ldr r7, [sp, #68] @ 4-byte Reload -; THUMBV6-NEXT: mov r2, r7 +; THUMBV6-NEXT: ldr r2, [sp, #44] @ 4-byte Reload ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul +; THUMBV6-NEXT: adds r0, r0, r6 ; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r2 -; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV6-NEXT: str r0, [r2, #4] ; THUMBV6-NEXT: adcs r1, r5 -; THUMBV6-NEXT: adds r0, r6, r1 -; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: mov r6, r7 -; THUMBV6-NEXT: mov r7, r5 -; THUMBV6-NEXT: adcs r7, r5 -; THUMBV6-NEXT: mov r0, r4 +; THUMBV6-NEXT: adds r0, r4, r1 +; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill +; THUMBV6-NEXT: mov r6, r5 +; THUMBV6-NEXT: adcs r6, r5 +; THUMBV6-NEXT: ldr r0, [sp, #56] @ 4-byte Reload ; THUMBV6-NEXT: mov r1, r5 -; THUMBV6-NEXT: mov r2, r6 +; THUMBV6-NEXT: ldr r4, [sp, #44] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r4 ; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul -; THUMBV6-NEXT: mov r4, r1 -; THUMBV6-NEXT: ldr r1, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r0, r1 -; THUMBV6-NEXT: str r0, [sp, #36] @ 4-byte Spill -; THUMBV6-NEXT: adcs r4, r7 -; THUMBV6-NEXT: ldr r0, [sp, #64] @ 4-byte Reload -; THUMBV6-NEXT: mov r1, r6 -; THUMBV6-NEXT: ldr r2, [sp, #56] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, 
r0, r2 +; THUMBV6-NEXT: str r0, [sp, #28] @ 4-byte Spill +; THUMBV6-NEXT: adcs r1, r6 +; THUMBV6-NEXT: str r1, [sp, #24] @ 4-byte Spill +; THUMBV6-NEXT: ldr r0, [sp, #48] @ 4-byte Reload +; THUMBV6-NEXT: mov r1, r4 +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: mov r6, r0 -; THUMBV6-NEXT: mov r7, r1 -; THUMBV6-NEXT: ldr r0, [sp, #80] @ 4-byte Reload -; THUMBV6-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; THUMBV6-NEXT: ldr r2, [sp, #72] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #76] @ 4-byte Reload +; THUMBV6-NEXT: mov r4, r1 +; THUMBV6-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; THUMBV6-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; THUMBV6-NEXT: mov r2, r5 +; THUMBV6-NEXT: mov r3, r5 ; THUMBV6-NEXT: bl __aeabi_lmul ; THUMBV6-NEXT: adds r0, r0, r6 -; THUMBV6-NEXT: adcs r1, r7 -; THUMBV6-NEXT: ldr r2, [sp, #36] @ 4-byte Reload -; THUMBV6-NEXT: adds r0, r2, r0 ; THUMBV6-NEXT: adcs r1, r4 -; THUMBV6-NEXT: .LBB0_8: @ %overflow.res -; THUMBV6-NEXT: ldr r2, [sp, #48] @ 4-byte Reload -; THUMBV6-NEXT: ldr r3, [sp, #44] @ 4-byte Reload -; THUMBV6-NEXT: str r3, [r2] -; THUMBV6-NEXT: ldr r3, [sp, #40] @ 4-byte Reload -; THUMBV6-NEXT: str r3, [r2, #4] +; THUMBV6-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r2, r0 +; THUMBV6-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r2 +; THUMBV6-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; THUMBV6-NEXT: adds r0, r0, r2 +; THUMBV6-NEXT: ldr r2, [sp, #40] @ 4-byte Reload ; THUMBV6-NEXT: str r0, [r2, #8] +; THUMBV6-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; THUMBV6-NEXT: adcs r1, r0 ; THUMBV6-NEXT: str r1, [r2, #12] +; THUMBV6-NEXT: adcs r5, r5 +; THUMBV6-NEXT: orrs r5, r7 ; THUMBV6-NEXT: movs r0, #1 ; THUMBV6-NEXT: ands r0, r5 ; THUMBV6-NEXT: strb r0, [r2, #16] -; THUMBV6-NEXT: add sp, #84 +; THUMBV6-NEXT: add sp, #60 ; THUMBV6-NEXT: pop {r4, r5, r6, r7, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll index 07cd9788d91e1..fe1d06cb39e16 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-128-legalisation-lowering.ll @@ -3,211 +3,125 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; THUMBV7-LABEL: muloti_test: -; THUMBV7: @ %bb.0: @ %overflow.entry +; THUMBV7: @ %bb.0: @ %start ; THUMBV7-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; THUMBV7-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; THUMBV7-NEXT: .pad #12 -; THUMBV7-NEXT: sub sp, #12 -; THUMBV7-NEXT: ldrd r11, r6, [sp, #48] -; THUMBV7-NEXT: ldrd r10, r5, [sp, #64] -; THUMBV7-NEXT: ldrd r9, r12, [sp, #56] -; THUMBV7-NEXT: orrs.w r1, r11, r6 -; THUMBV7-NEXT: beq .LBB0_3 -; THUMBV7-NEXT: @ %bb.1: @ %overflow.lhs -; THUMBV7-NEXT: orr.w r4, r10, r5 -; THUMBV7-NEXT: cmp r4, #0 -; THUMBV7-NEXT: beq.w .LBB0_5 -; THUMBV7-NEXT: @ %bb.2: @ %overflow +; THUMBV7-NEXT: .pad #44 +; THUMBV7-NEXT: sub sp, #44 +; THUMBV7-NEXT: ldr.w r8, [sp, #88] +; THUMBV7-NEXT: mov r9, r0 +; THUMBV7-NEXT: ldr r7, [sp, #96] +; THUMBV7-NEXT: ldr.w lr, [sp, #100] +; THUMBV7-NEXT: umull r0, r5, r2, r8 +; THUMBV7-NEXT: ldr r4, [sp, #80] +; THUMBV7-NEXT: str r0, [sp, #32] @ 4-byte Spill +; THUMBV7-NEXT: umull r1, r0, r3, r7 +; THUMBV7-NEXT: str r0, [sp, #4] @ 4-byte Spill +; THUMBV7-NEXT: umull r0, r11, lr, r2 +; THUMBV7-NEXT: str r1, [sp, #20] @ 4-byte Spill +; THUMBV7-NEXT: ldr r1, [sp, 
#92] +; THUMBV7-NEXT: str r0, [sp] @ 4-byte Spill +; THUMBV7-NEXT: umull r0, r10, r7, r2 +; THUMBV7-NEXT: mov r7, r1 +; THUMBV7-NEXT: umull r6, r12, r1, r4 +; THUMBV7-NEXT: str r0, [sp, #40] @ 4-byte Spill +; THUMBV7-NEXT: ldr r0, [sp, #84] +; THUMBV7-NEXT: str r6, [sp, #24] @ 4-byte Spill +; THUMBV7-NEXT: umull r6, r1, r0, r8 +; THUMBV7-NEXT: str r6, [sp, #16] @ 4-byte Spill +; THUMBV7-NEXT: umull r6, r2, r2, r7 +; THUMBV7-NEXT: mov r7, r4 +; THUMBV7-NEXT: strd r6, r2, [sp, #8] @ 8-byte Folded Spill +; THUMBV7-NEXT: umull r2, r6, r4, r8 +; THUMBV7-NEXT: str r2, [sp, #36] @ 4-byte Spill +; THUMBV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: str r6, [sp, #28] @ 4-byte Spill +; THUMBV7-NEXT: movs r6, #0 +; THUMBV7-NEXT: str.w r2, [r9] +; THUMBV7-NEXT: umlal r5, r6, r3, r8 +; THUMBV7-NEXT: ldr r2, [sp, #20] @ 4-byte Reload +; THUMBV7-NEXT: ldr r4, [sp] @ 4-byte Reload +; THUMBV7-NEXT: add r4, r2 +; THUMBV7-NEXT: adds.w r2, r10, r4 +; THUMBV7-NEXT: str r2, [sp, #20] @ 4-byte Spill +; THUMBV7-NEXT: mov.w r2, #0 +; THUMBV7-NEXT: adc r2, r2, #0 +; THUMBV7-NEXT: cmp.w r12, #0 +; THUMBV7-NEXT: str r2, [sp, #32] @ 4-byte Spill ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: movne.w r12, #1 ; THUMBV7-NEXT: cmp r1, #0 +; THUMBV7-NEXT: ldr r2, [sp, #96] ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 -; THUMBV7-NEXT: and.w lr, r1, r4 -; THUMBV7-NEXT: umull r7, r4, r6, r9 -; THUMBV7-NEXT: cmp.w r12, #0 -; THUMBV7-NEXT: mov r1, r12 -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r1, #1 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: orrs.w r10, r7, r0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: ands r1, r6 -; THUMBV7-NEXT: cmp r4, #0 +; THUMBV7-NEXT: movne.w r10, #1 +; THUMBV7-NEXT: orrs.w r7, r2, lr +; THUMBV7-NEXT: ldr r2, [sp, #92] ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r4, #1 -; THUMBV7-NEXT: orrs r1, r4 -; THUMBV7-NEXT: umull r4, r6, r12, r11 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: movne r7, #1 +; THUMBV7-NEXT: cmp r0, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: orrs r6, r1 -; THUMBV7-NEXT: adds r1, r7, r4 -; THUMBV7-NEXT: umull r11, r4, r11, r9 -; THUMBV7-NEXT: adds.w r8, r4, r1 -; THUMBV7-NEXT: mov.w r1, #0 -; THUMBV7-NEXT: adc r4, r1, #0 -; THUMBV7-NEXT: cmp r3, #0 -; THUMBV7-NEXT: orr.w r4, r4, r6 -; THUMBV7-NEXT: umull r7, r6, r5, r2 -; THUMBV7-NEXT: orr.w lr, lr, r4 -; THUMBV7-NEXT: mov r4, r3 +; THUMBV7-NEXT: movne r0, #1 +; THUMBV7-NEXT: cmp r2, #0 +; THUMBV7-NEXT: mov r4, r2 +; THUMBV7-NEXT: mov r8, r2 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r4, #1 -; THUMBV7-NEXT: cmp r5, #0 +; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; THUMBV7-NEXT: ands r0, r4 +; THUMBV7-NEXT: movs r4, #0 +; THUMBV7-NEXT: adds r5, r5, r2 +; THUMBV7-NEXT: str.w r5, [r9, #4] +; THUMBV7-NEXT: orr.w r0, r0, r1 +; THUMBV7-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; THUMBV7-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; THUMBV7-NEXT: and.w r5, r10, r7 +; THUMBV7-NEXT: orr.w r0, r0, r12 +; THUMBV7-NEXT: mov.w r12, #0 +; THUMBV7-NEXT: add r1, r2 +; THUMBV7-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; THUMBV7-NEXT: adcs r2, r6 +; THUMBV7-NEXT: ldr r6, [sp, #28] @ 4-byte Reload +; THUMBV7-NEXT: adc r7, r4, #0 +; THUMBV7-NEXT: adds r1, r1, r6 +; THUMBV7-NEXT: umlal r2, r7, r3, r8 +; THUMBV7-NEXT: adc r4, r4, #0 +; THUMBV7-NEXT: orrs r0, r4 +; THUMBV7-NEXT: orrs r0, r5 +; THUMBV7-NEXT: ldrd r5, r4, [sp, #36] @ 8-byte Folded Reload +; THUMBV7-NEXT: adds r5, r5, r4 +; THUMBV7-NEXT: ldr r4, [sp, #20] @ 4-byte Reload +; THUMBV7-NEXT: adcs 
r1, r4 +; THUMBV7-NEXT: ldr r4, [sp, #4] @ 4-byte Reload +; THUMBV7-NEXT: cmp r4, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r5, #1 -; THUMBV7-NEXT: ands r4, r5 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: movne r4, #1 +; THUMBV7-NEXT: cmp r3, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: orrs r4, r6 -; THUMBV7-NEXT: umull r5, r6, r3, r10 -; THUMBV7-NEXT: cmp r6, #0 +; THUMBV7-NEXT: movne r3, #1 +; THUMBV7-NEXT: cmp.w lr, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r6, #1 -; THUMBV7-NEXT: orrs r4, r6 -; THUMBV7-NEXT: add r5, r7 -; THUMBV7-NEXT: umull r6, r7, r10, r2 -; THUMBV7-NEXT: adds r5, r5, r7 -; THUMBV7-NEXT: adc r7, r1, #0 -; THUMBV7-NEXT: adds.w r6, r6, r11 -; THUMBV7-NEXT: orr.w r4, r4, r7 -; THUMBV7-NEXT: mov.w r7, #0 -; THUMBV7-NEXT: orr.w lr, lr, r4 -; THUMBV7-NEXT: umull r11, r4, r2, r9 -; THUMBV7-NEXT: adc.w r10, r8, r5 -; THUMBV7-NEXT: umlal r4, r7, r3, r9 -; THUMBV7-NEXT: umull r2, r5, r2, r12 -; THUMBV7-NEXT: adds.w r8, r2, r4 -; THUMBV7-NEXT: adcs.w r2, r7, r5 -; THUMBV7-NEXT: adc r4, r1, #0 -; THUMBV7-NEXT: umlal r2, r4, r3, r12 -; THUMBV7-NEXT: adds r2, r2, r6 -; THUMBV7-NEXT: adcs.w r3, r4, r10 -; THUMBV7-NEXT: adc r1, r1, #0 -; THUMBV7-NEXT: orr.w r1, r1, lr -; THUMBV7-NEXT: b .LBB0_8 -; THUMBV7-NEXT: .LBB0_3: @ %overflow.no.lhs -; THUMBV7-NEXT: orrs.w r1, r10, r5 -; THUMBV7-NEXT: beq.w .LBB0_7 -; THUMBV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; THUMBV7-NEXT: umull r1, lr, r2, r10 -; THUMBV7-NEXT: movs r7, #0 -; THUMBV7-NEXT: umlal lr, r7, r3, r10 -; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill -; THUMBV7-NEXT: umull r4, r8, r2, r5 -; THUMBV7-NEXT: adds.w r1, r4, lr -; THUMBV7-NEXT: str r1, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: adcs.w r7, r7, r8 -; THUMBV7-NEXT: mov.w r1, #0 -; THUMBV7-NEXT: adc lr, r1, #0 -; THUMBV7-NEXT: umull r8, r1, r10, r11 -; THUMBV7-NEXT: mla r1, r10, r6, r1 -; THUMBV7-NEXT: umlal r7, lr, r3, r5 -; THUMBV7-NEXT: mla r1, r5, r11, r1 -; THUMBV7-NEXT: adds.w r5, r7, r8 -; THUMBV7-NEXT: umull r4, r7, r2, r9 -; THUMBV7-NEXT: adc.w r10, lr, r1 -; THUMBV7-NEXT: movs r1, #0 -; THUMBV7-NEXT: umlal r7, r1, r3, r9 -; THUMBV7-NEXT: umull r2, lr, r2, r12 -; THUMBV7-NEXT: adds.w r8, r2, r7 -; THUMBV7-NEXT: mov.w r2, #0 -; THUMBV7-NEXT: adcs.w r1, r1, lr -; THUMBV7-NEXT: adc r2, r2, #0 -; THUMBV7-NEXT: umlal r1, r2, r3, r12 -; THUMBV7-NEXT: umull r3, r7, r9, r11 -; THUMBV7-NEXT: mla r7, r9, r6, r7 -; THUMBV7-NEXT: adds r1, r1, r3 -; THUMBV7-NEXT: mla r7, r12, r11, r7 -; THUMBV7-NEXT: mov r11, r4 -; THUMBV7-NEXT: adc.w r3, r2, r7 -; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; THUMBV7-NEXT: adds r2, r2, r1 -; THUMBV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; THUMBV7-NEXT: adcs r3, r1 -; THUMBV7-NEXT: adcs r1, r5, #0 -; THUMBV7-NEXT: adc r7, r10, #0 -; THUMBV7-NEXT: b .LBB0_6 -; THUMBV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; THUMBV7-NEXT: umull r1, r4, r9, r11 -; THUMBV7-NEXT: movs r7, #0 -; THUMBV7-NEXT: mov.w r8, #0 -; THUMBV7-NEXT: umlal r4, r7, r12, r11 -; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill -; THUMBV7-NEXT: umull r1, lr, r9, r6 -; THUMBV7-NEXT: adds r1, r1, r4 -; THUMBV7-NEXT: str r1, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: adcs.w r7, r7, lr -; THUMBV7-NEXT: umull lr, r1, r11, r10 -; THUMBV7-NEXT: adc r4, r8, #0 -; THUMBV7-NEXT: mla r1, r11, r5, r1 -; THUMBV7-NEXT: umlal r7, r4, r12, r6 -; THUMBV7-NEXT: mla r1, r6, r10, r1 -; THUMBV7-NEXT: adds.w r7, r7, lr -; THUMBV7-NEXT: str r7, [sp] @ 4-byte Spill -; THUMBV7-NEXT: mov.w r7, #0 -; THUMBV7-NEXT: adc.w r11, r4, r1 -; THUMBV7-NEXT: umull lr, r4, 
r9, r2 -; THUMBV7-NEXT: umlal r4, r7, r12, r2 -; THUMBV7-NEXT: umull r1, r9, r9, r3 -; THUMBV7-NEXT: adds.w r8, r1, r4 -; THUMBV7-NEXT: mov.w r4, #0 -; THUMBV7-NEXT: adcs.w r1, r7, r9 -; THUMBV7-NEXT: umull r7, r6, r2, r10 -; THUMBV7-NEXT: adc r4, r4, #0 -; THUMBV7-NEXT: mla r2, r2, r5, r6 -; THUMBV7-NEXT: umlal r1, r4, r12, r3 -; THUMBV7-NEXT: mla r2, r3, r10, r2 -; THUMBV7-NEXT: adds r1, r1, r7 -; THUMBV7-NEXT: adc.w r3, r4, r2 -; THUMBV7-NEXT: ldr r2, [sp, #8] @ 4-byte Reload -; THUMBV7-NEXT: adds r2, r2, r1 -; THUMBV7-NEXT: ldr r1, [sp, #4] @ 4-byte Reload -; THUMBV7-NEXT: adcs r3, r1 -; THUMBV7-NEXT: ldr r1, [sp] @ 4-byte Reload -; THUMBV7-NEXT: adcs r1, r1, #0 -; THUMBV7-NEXT: adc r7, r11, #0 -; THUMBV7-NEXT: mov r11, lr -; THUMBV7-NEXT: .LBB0_6: @ %overflow.res -; THUMBV7-NEXT: orrs r1, r7 +; THUMBV7-NEXT: movne.w lr, #1 +; THUMBV7-NEXT: cmp.w r11, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r1, #1 -; THUMBV7-NEXT: b .LBB0_8 -; THUMBV7-NEXT: .LBB0_7: @ %overflow.no -; THUMBV7-NEXT: umull r1, lr, r2, r9 -; THUMBV7-NEXT: movs r4, #0 -; THUMBV7-NEXT: umlal lr, r4, r3, r9 -; THUMBV7-NEXT: str r1, [sp, #8] @ 4-byte Spill -; THUMBV7-NEXT: movs r1, #0 -; THUMBV7-NEXT: umull r7, r8, r2, r12 -; THUMBV7-NEXT: adds.w r7, r7, lr -; THUMBV7-NEXT: str r7, [sp] @ 4-byte Spill -; THUMBV7-NEXT: adcs.w r7, r4, r8 -; THUMBV7-NEXT: ldr r4, [sp, #60] -; THUMBV7-NEXT: adc r8, r1, #0 -; THUMBV7-NEXT: umlal r7, r8, r3, r12 -; THUMBV7-NEXT: umull r12, lr, r9, r11 -; THUMBV7-NEXT: mla r6, r9, r6, lr -; THUMBV7-NEXT: str.w r12, [sp, #4] @ 4-byte Spill -; THUMBV7-NEXT: mla r12, r4, r11, r6 -; THUMBV7-NEXT: ldr.w r11, [sp, #8] @ 4-byte Reload -; THUMBV7-NEXT: umull lr, r6, r10, r2 -; THUMBV7-NEXT: mla r3, r10, r3, r6 -; THUMBV7-NEXT: mla r2, r5, r2, r3 -; THUMBV7-NEXT: ldr r3, [sp, #4] @ 4-byte Reload -; THUMBV7-NEXT: adds.w r3, r3, lr -; THUMBV7-NEXT: adc.w r6, r2, r12 -; THUMBV7-NEXT: adds r2, r7, r3 -; THUMBV7-NEXT: adc.w r3, r8, r6 -; THUMBV7-NEXT: ldr.w r8, [sp] @ 4-byte Reload -; THUMBV7-NEXT: .LBB0_8: @ %overflow.res -; THUMBV7-NEXT: strd r11, r8, [r0] -; THUMBV7-NEXT: and r1, r1, #1 -; THUMBV7-NEXT: strd r2, r3, [r0, #8] -; THUMBV7-NEXT: strb r1, [r0, #16] -; THUMBV7-NEXT: add sp, #12 +; THUMBV7-NEXT: movne.w r11, #1 +; THUMBV7-NEXT: adds r2, r2, r5 +; THUMBV7-NEXT: and.w r3, r3, lr +; THUMBV7-NEXT: str.w r2, [r9, #8] +; THUMBV7-NEXT: adcs r1, r7 +; THUMBV7-NEXT: str.w r1, [r9, #12] +; THUMBV7-NEXT: orr.w r1, r3, r11 +; THUMBV7-NEXT: ldr r2, [sp, #32] @ 4-byte Reload +; THUMBV7-NEXT: orr.w r1, r1, r4 +; THUMBV7-NEXT: orr.w r1, r1, r2 +; THUMBV7-NEXT: orr.w r0, r0, r1 +; THUMBV7-NEXT: adc r1, r12, #0 +; THUMBV7-NEXT: orrs r0, r1 +; THUMBV7-NEXT: and r0, r0, #1 +; THUMBV7-NEXT: strb.w r0, [r9, #16] +; THUMBV7-NEXT: add sp, #44 ; THUMBV7-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll index 997868766d1dd..55e917159fce9 100644 --- a/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/Thumb2/umulo-64-legalisation-lowering.ll @@ -3,19 +3,15 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; THUMBV7-LABEL: mulodi_test: -; THUMBV7: @ %bb.0: @ %overflow.entry +; THUMBV7: @ %bb.0: @ %start ; THUMBV7-NEXT: .save {r4, r5, r7, lr} ; THUMBV7-NEXT: push {r4, r5, r7, lr} -; THUMBV7-NEXT: cbz r1, .LBB0_3 -; THUMBV7-NEXT: @ %bb.1: @ %overflow.lhs -; 
THUMBV7-NEXT: cbz r3, .LBB0_5 -; THUMBV7-NEXT: @ %bb.2: @ %overflow -; THUMBV7-NEXT: umull lr, r4, r3, r0 +; THUMBV7-NEXT: umull r12, lr, r3, r0 ; THUMBV7-NEXT: cmp r3, #0 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r3, #1 ; THUMBV7-NEXT: cmp r1, #0 -; THUMBV7-NEXT: umull r0, r12, r0, r2 +; THUMBV7-NEXT: umull r0, r4, r0, r2 ; THUMBV7-NEXT: umull r2, r5, r1, r2 ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r1, #1 @@ -24,44 +20,15 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; THUMBV7-NEXT: it ne ; THUMBV7-NEXT: movne r5, #1 ; THUMBV7-NEXT: orrs r1, r5 -; THUMBV7-NEXT: cmp r4, #0 +; THUMBV7-NEXT: cmp.w lr, #0 ; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne r4, #1 -; THUMBV7-NEXT: orr.w r3, r1, r4 -; THUMBV7-NEXT: add.w r1, r2, lr +; THUMBV7-NEXT: movne.w lr, #1 +; THUMBV7-NEXT: orr.w r3, r1, lr +; THUMBV7-NEXT: add.w r1, r2, r12 ; THUMBV7-NEXT: movs r2, #0 -; THUMBV7-NEXT: adds.w r1, r1, r12 +; THUMBV7-NEXT: adds r1, r1, r4 ; THUMBV7-NEXT: adc r2, r2, #0 -; THUMBV7-NEXT: orr.w r12, r3, r2 -; THUMBV7-NEXT: and r2, r12, #1 -; THUMBV7-NEXT: pop {r4, r5, r7, pc} -; THUMBV7-NEXT: .LBB0_3: @ %overflow.no.lhs -; THUMBV7-NEXT: mov r5, r0 -; THUMBV7-NEXT: umull r0, r4, r0, r2 -; THUMBV7-NEXT: cbz r3, .LBB0_7 -; THUMBV7-NEXT: @ %bb.4: @ %overflow.no.lhs.only -; THUMBV7-NEXT: mul r12, r1, r3 -; THUMBV7-NEXT: mla r1, r1, r2, r4 -; THUMBV7-NEXT: umlal r1, r12, r5, r3 -; THUMBV7-NEXT: b .LBB0_6 -; THUMBV7-NEXT: .LBB0_5: @ %overflow.no.rhs.only -; THUMBV7-NEXT: mov lr, r0 -; THUMBV7-NEXT: umull r0, r4, r2, r0 -; THUMBV7-NEXT: mov r5, r1 -; THUMBV7-NEXT: mul r12, r3, r1 -; THUMBV7-NEXT: mla r1, r3, lr, r4 -; THUMBV7-NEXT: umlal r1, r12, r2, r5 -; THUMBV7-NEXT: .LBB0_6: @ %overflow.res -; THUMBV7-NEXT: cmp.w r12, #0 -; THUMBV7-NEXT: it ne -; THUMBV7-NEXT: movne.w r12, #1 -; THUMBV7-NEXT: and r2, r12, #1 -; THUMBV7-NEXT: pop {r4, r5, r7, pc} -; THUMBV7-NEXT: .LBB0_7: @ %overflow.no -; THUMBV7-NEXT: mla r3, r5, r3, r4 -; THUMBV7-NEXT: mov.w r12, #0 -; THUMBV7-NEXT: mla r1, r1, r2, r3 -; THUMBV7-NEXT: and r2, r12, #1 +; THUMBV7-NEXT: orrs r2, r3 ; THUMBV7-NEXT: pop {r4, r5, r7, pc} start: %0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %l, i64 %r) #2 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll index 2d236cce94c30..e101c702e6409 100644 --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -6,181 +6,60 @@ ; This used to call muloti4, but that won't link with libgcc. 
define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { ; CHECK-LABEL: x: -; CHECK: ## %bb.0: ## %overflow.entry +; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rdi, %r8 -; CHECK-NEXT: sarq $63, %r8 -; CHECK-NEXT: cmpq %r8, %rsi -; CHECK-NEXT: je LBB0_5 -; CHECK-NEXT: ## %bb.1: ## %overflow.lhs -; CHECK-NEXT: cmpq %rax, %rcx -; CHECK-NEXT: je LBB0_2 -; CHECK-NEXT: ## %bb.7: ## %overflow1 +; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movq %rsi, %r8 ; CHECK-NEXT: movq %rsi, %rbx ; CHECK-NEXT: sarq $63, %rbx ; CHECK-NEXT: imulq %rdx, %rbx ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdx, %r10 ; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: movq %rdx, %r9 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: mulq %r10 ; CHECK-NEXT: movq %rdx, %r10 +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: mulq %r9 +; CHECK-NEXT: movq %rdx, %r9 ; CHECK-NEXT: movq %rax, %r11 -; CHECK-NEXT: addq %r9, %r11 -; CHECK-NEXT: adcq %rbx, %r10 -; CHECK-NEXT: movq %r10, %rbx +; CHECK-NEXT: addq %r10, %r11 +; CHECK-NEXT: adcq %rbx, %r9 +; CHECK-NEXT: movq %r9, %rbx ; CHECK-NEXT: sarq $63, %rbx -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rdi, %r14 -; CHECK-NEXT: imulq %rax, %r14 +; CHECK-NEXT: movq %rcx, %r14 +; CHECK-NEXT: sarq $63, %r14 +; CHECK-NEXT: imulq %rdi, %r14 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movq %rdx, %r10 ; CHECK-NEXT: movq %rax, %rdi ; CHECK-NEXT: addq %r11, %rdi -; CHECK-NEXT: adcq %r14, %r9 -; CHECK-NEXT: movq %r9, %r11 +; CHECK-NEXT: adcq %r14, %r10 +; CHECK-NEXT: movq %r10, %r11 ; CHECK-NEXT: sarq $63, %r11 -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: addq %r9, %r10 ; CHECK-NEXT: adcq %rbx, %r11 -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: imulq %rcx -; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: addq %r9, %rcx +; CHECK-NEXT: imulq %rcx +; CHECK-NEXT: addq %r10, %rax ; CHECK-NEXT: adcq %r11, %rdx -; CHECK-NEXT: movq %rdi, %rsi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: xorq %rdi, %rdx -; CHECK-NEXT: xorq %rcx, %rdi -; CHECK-NEXT: orq %rdx, %rdi -; CHECK-NEXT: jmp LBB0_8 -; CHECK-NEXT: LBB0_5: ## %overflow.no.lhs -; CHECK-NEXT: cmpq %rax, %rcx -; CHECK-NEXT: je LBB0_6 -; CHECK-NEXT: ## %bb.4: ## %overflow.no.lhs.only -; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rsi, %r9 -; CHECK-NEXT: xorq %rax, %r9 -; CHECK-NEXT: movq %rdi, %r8 -; CHECK-NEXT: xorq %rax, %r8 -; CHECK-NEXT: subq %rax, %r8 -; CHECK-NEXT: sbbq %rax, %r9 -; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: sets %r10b -; CHECK-NEXT: cmovnsq %rsi, %r9 -; CHECK-NEXT: cmovnsq %rdi, %r8 -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rcx, %rsi -; CHECK-NEXT: xorq %rax, %rsi -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: xorq %rax, %rdi -; CHECK-NEXT: subq %rax, %rdi -; CHECK-NEXT: sbbq %rax, %rsi -; CHECK-NEXT: testq %rcx, %rcx -; CHECK-NEXT: sets %r11b -; CHECK-NEXT: cmovnsq %rcx, %rsi -; CHECK-NEXT: cmovnsq %rdx, %rdi -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: mulq %rdi -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: imulq %r9, %rdi -; CHECK-NEXT: addq %rdx, %rdi -; CHECK-NEXT: imulq %rsi, %r9 -; 
CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: mulq %rsi -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: adcq %r9, %rdx -; CHECK-NEXT: xorb %r10b, %r11b -; CHECK-NEXT: movzbl %r11b, %ecx -; CHECK-NEXT: jmp LBB0_3 -; CHECK-NEXT: LBB0_2: ## %overflow.no.rhs.only -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rcx, %r9 -; CHECK-NEXT: xorq %rax, %r9 -; CHECK-NEXT: movq %rdx, %r8 -; CHECK-NEXT: xorq %rax, %r8 -; CHECK-NEXT: subq %rax, %r8 -; CHECK-NEXT: sbbq %rax, %r9 -; CHECK-NEXT: testq %rcx, %rcx -; CHECK-NEXT: sets %r10b -; CHECK-NEXT: cmovnsq %rcx, %r9 -; CHECK-NEXT: cmovnsq %rdx, %r8 +; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: jne LBB0_1 +; CHECK-NEXT: ## %bb.2: ## %nooverflow ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: sarq $63, %rax -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: xorq %rax, %r14 -; CHECK-NEXT: movq %rdi, %r11 -; CHECK-NEXT: xorq %rax, %r11 -; CHECK-NEXT: subq %rax, %r11 -; CHECK-NEXT: sbbq %rax, %r14 -; CHECK-NEXT: testq %rsi, %rsi -; CHECK-NEXT: sets %bl -; CHECK-NEXT: cmovnsq %rsi, %r14 -; CHECK-NEXT: cmovnsq %rdi, %r11 -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: mulq %r11 -; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: imulq %r9, %r11 -; CHECK-NEXT: addq %rdx, %r11 -; CHECK-NEXT: imulq %r14, %r9 -; CHECK-NEXT: movq %r8, %rax -; CHECK-NEXT: mulq %r14 -; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: addq %r11, %rsi -; CHECK-NEXT: adcq %r9, %rdx -; CHECK-NEXT: xorb %r10b, %bl -; CHECK-NEXT: movzbl %bl, %ecx -; CHECK-NEXT: LBB0_3: ## %overflow.res -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: negq %rdi -; CHECK-NEXT: xorq %rdi, %rax -; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: xorl %r8d, %r8d -; CHECK-NEXT: cmpq %rcx, %rax -; CHECK-NEXT: setb %r8b -; CHECK-NEXT: xorq %rdi, %rsi -; CHECK-NEXT: addq %r8, %rsi -; CHECK-NEXT: xorq %rdx, %rdi -; CHECK-NEXT: cmpq %r8, %rsi -; CHECK-NEXT: adcq $0, %rdi -; CHECK-NEXT: LBB0_8: ## %overflow.res -; CHECK-NEXT: setne %cl -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: jne LBB0_10 -; CHECK-NEXT: LBB0_11: ## %nooverflow -; CHECK-NEXT: movq %rsi, %rdx +; CHECK-NEXT: movq %rdi, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq -; CHECK-NEXT: LBB0_6: ## %overflow.no -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %rdx, %r8 -; CHECK-NEXT: mulq %rdx -; CHECK-NEXT: imulq %rcx, %rdi -; CHECK-NEXT: addq %rdx, %rdi -; CHECK-NEXT: imulq %r8, %rsi -; CHECK-NEXT: addq %rdi, %rsi -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testb $1, %cl -; CHECK-NEXT: je LBB0_11 -; CHECK-NEXT: LBB0_10: ## %overflow +; CHECK-NEXT: LBB0_1: ## %overflow ; CHECK-NEXT: ud2 entry: %tmp16 = zext i64 %a.coerce0 to i128 diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 1460a2564cc3e..13596e1b18768 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -4,185 +4,64 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X64-LABEL: smuloi128: -; X64: ## %bb.0: ## %overflow.entry -; X64-NEXT: pushq %rbp -; X64-NEXT: .cfi_def_cfa_offset 16 +; X64: ## %bb.0: ; X64-NEXT: pushq %r15 -; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 -; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_def_cfa_offset 24 ; 
X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r14, -32 -; X64-NEXT: .cfi_offset %r15, -24 -; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rdi, %r9 -; X64-NEXT: sarq $63, %r9 -; X64-NEXT: cmpq %r9, %rsi -; X64-NEXT: je LBB0_5 -; X64-NEXT: ## %bb.1: ## %overflow.lhs -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: je LBB0_2 -; X64-NEXT: ## %bb.7: ## %overflow +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: .cfi_offset %r14, -24 +; X64-NEXT: .cfi_offset %r15, -16 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rsi, %r9 ; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: sarq $63, %r14 ; X64-NEXT: imulq %rdx, %r14 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r10, %rbx -; X64-NEXT: adcq %r14, %r11 -; X64-NEXT: movq %r11, %r14 +; X64-NEXT: addq %r11, %rbx +; X64-NEXT: adcq %r14, %r10 +; X64-NEXT: movq %r10, %r14 ; X64-NEXT: sarq $63, %r14 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: imulq %rax, %r15 +; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: imulq %rdi, %r15 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rbx, %rdi -; X64-NEXT: adcq %r15, %r10 -; X64-NEXT: movq %r10, %rbx +; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: movq %r11, %rbx ; X64-NEXT: sarq $63, %rbx -; X64-NEXT: addq %r11, %r10 +; X64-NEXT: addq %r10, %r11 ; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: imulq %rcx -; X64-NEXT: addq %r10, %rax +; X64-NEXT: addq %r11, %rax ; X64-NEXT: adcq %rbx, %rdx -; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: movq %rdi, 8(%r8) ; X64-NEXT: sarq $63, %rdi ; X64-NEXT: xorq %rdi, %rdx ; X64-NEXT: xorq %rax, %rdi ; X64-NEXT: orq %rdx, %rdi -; X64-NEXT: jmp LBB0_8 -; X64-NEXT: LBB0_5: ## %overflow.no.lhs -; X64-NEXT: cmpq %rax, %rcx -; X64-NEXT: je LBB0_6 -; X64-NEXT: ## %bb.4: ## %overflow.no.lhs.only -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rsi, %r11 -; X64-NEXT: xorq %rax, %r11 -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: xorq %rax, %r10 -; X64-NEXT: subq %rax, %r10 -; X64-NEXT: sbbq %rax, %r11 -; X64-NEXT: testq %rsi, %rsi -; X64-NEXT: sets %bl -; X64-NEXT: cmovnsq %rsi, %r11 -; X64-NEXT: cmovnsq %rdi, %r10 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rcx, %rsi -; X64-NEXT: xorq %rax, %rsi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: xorq %rax, %rdi -; X64-NEXT: subq %rax, %rdi -; X64-NEXT: sbbq %rax, %rsi -; X64-NEXT: testq %rcx, %rcx -; X64-NEXT: sets %bpl -; X64-NEXT: cmovnsq %rcx, %rsi -; X64-NEXT: cmovnsq %rdx, %rdi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: imulq %r11, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: imulq %rsi, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: jmp LBB0_3 -; X64-NEXT: LBB0_2: ## %overflow.no.rhs.only -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rcx, %r11 -; 
X64-NEXT: xorq %rax, %r11 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: xorq %rax, %r10 -; X64-NEXT: subq %rax, %r10 -; X64-NEXT: sbbq %rax, %r11 -; X64-NEXT: testq %rcx, %rcx -; X64-NEXT: sets %bl -; X64-NEXT: cmovnsq %rcx, %r11 -; X64-NEXT: cmovnsq %rdx, %r10 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rsi, %r14 -; X64-NEXT: xorq %rax, %r14 -; X64-NEXT: movq %rdi, %rcx -; X64-NEXT: xorq %rax, %rcx -; X64-NEXT: subq %rax, %rcx -; X64-NEXT: sbbq %rax, %r14 -; X64-NEXT: testq %rsi, %rsi -; X64-NEXT: sets %bpl -; X64-NEXT: cmovnsq %rsi, %r14 -; X64-NEXT: cmovnsq %rdi, %rcx -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: imulq %r11, %rcx -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: imulq %r14, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: LBB0_3: ## %overflow.res -; X64-NEXT: adcq %r11, %rdx -; X64-NEXT: xorb %bl, %bpl -; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: negq %rcx -; X64-NEXT: xorq %rcx, %r9 -; X64-NEXT: addq %rax, %r9 -; X64-NEXT: xorl %edi, %edi -; X64-NEXT: cmpq %rax, %r9 -; X64-NEXT: setb %dil -; X64-NEXT: xorq %rcx, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: xorq %rdx, %rcx -; X64-NEXT: cmpq %rdi, %rsi -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: LBB0_8: ## %overflow.res ; X64-NEXT: setne %al -; X64-NEXT: jmp LBB0_9 -; X64-NEXT: LBB0_6: ## %overflow.no -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: imulq %rcx, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: imulq %r10, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: LBB0_9: ## %overflow.res -; X64-NEXT: movq %r9, (%r8) -; X64-NEXT: movq %rsi, 8(%r8) -; X64-NEXT: andb $1, %al -; X64-NEXT: ## kill: def $al killed $al killed $eax +; X64-NEXT: movq %rsi, (%r8) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 -; X64-NEXT: popq %rbp ; X64-NEXT: retq ; ; X86-LABEL: smuloi128: -; X86: ## %bb.0: ## %overflow.entry +; X86: ## %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -191,212 +70,196 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $52, %esp -; X86-NEXT: .cfi_def_cfa_offset 72 +; X86-NEXT: subl $44, %esp +; X86-NEXT: .cfi_def_cfa_offset 64 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: movl %esi, %edx -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: je LBB0_12 -; X86-NEXT: ## %bb.1: ## %overflow.lhs ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: je LBB0_2 -; X86-NEXT: ## %bb.14: ## %overflow ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; 
X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ebp ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: adcl %edi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; 
X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: sarl $31, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: imull %esi, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: imull %edx, %ecx +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: imull %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %eax, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl $0, %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: movzbl %bl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: movl %esi, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: xorl %ecx, 
%eax @@ -405,414 +268,17 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: xorl %ecx, %edx ; X86-NEXT: xorl %ebx, %ecx ; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload ; X86-NEXT: orl %edi, %ecx -; X86-NEXT: jmp LBB0_15 -; X86-NEXT: LBB0_12: ## %overflow.no.lhs -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: je LBB0_13 -; X86-NEXT: ## %bb.7: ## %overflow.no.lhs.only -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %esi, %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: subl %eax, %ebp -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB0_9 -; X86-NEXT: ## %bb.8: ## %overflow.no.lhs.only -; X86-NEXT: movl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: LBB0_9: ## %overflow.no.lhs.only -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB0_11 -; X86-NEXT: ## %bb.10: ## %overflow.no.lhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: LBB0_11: ## %overflow.no.lhs.only -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: 
adcl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: imull %edx, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload -; X86-NEXT: movzbl %cl, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: setb %dl -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: movzbl %dl, %edx -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: cmpl %edx, %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: jmp LBB0_15 -; X86-NEXT: LBB0_2: ## %overflow.no.rhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx 
-; X86-NEXT: movl %edx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: subl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB0_4 -; X86-NEXT: ## %bb.3: ## %overflow.no.rhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: LBB0_4: ## %overflow.no.rhs.only -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: subl %eax, %ebx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB0_6 -; X86-NEXT: ## %bb.5: ## %overflow.no.rhs.only -; X86-NEXT: movl %esi, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: LBB0_6: ## %overflow.no.rhs.only -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: imull %edx, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: addl %ecx, %edx 
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: imull %ecx, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload -; X86-NEXT: movzbl %cl, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: setb %dl -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: movzbl %dl, %edx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: cmpl %edx, %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sbbl $0, %eax -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: LBB0_15: ## %overflow.res +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: setne %al -; X86-NEXT: jmp LBB0_16 -; X86-NEXT: 
LBB0_13: ## %overflow.no -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %edx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: imull %ecx, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %esi, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: LBB0_16: ## %overflow.res -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, 4(%ecx) -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: andb $1, %al -; X86-NEXT: ## kill: def $al killed $al killed $eax -; X86-NEXT: addl $52, %esp +; X86-NEXT: addl $44, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -827,7 +293,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-LABEL: smuloi256: -; X64: ## %bb.0: ## %overflow.entry +; X64: ## %bb.0: ; X64-NEXT: pushq %rbp ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r15 @@ -846,558 +312,199 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %r8, %r12 +; X64-NEXT: movq %rcx, %rbx ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rsi, %r11 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: sarq $63, %rsi -; X64-NEXT: movq %r11, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: movq %r12, %rdx -; X64-NEXT: xorq %rcx, %rdx -; X64-NEXT: xorq %r8, %rcx -; X64-NEXT: orq %rdx, %rcx -; X64-NEXT: je LBB1_4 -; X64-NEXT: ## %bb.1: ## %overflow.lhs -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill 
-; X64-NEXT: movq %rax, %rcx -; X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: xorq %rbx, %rsi -; X64-NEXT: orq %rcx, %rsi -; X64-NEXT: je LBB1_2 -; X64-NEXT: ## %bb.6: ## %overflow -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rsi, %r10 +; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdi, %r13 -; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: addq %r14, %r13 +; X64-NEXT: adcq %rcx, %rsi ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %edi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movzbl %al, %ecx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: adcq %rcx, %r8 +; X64-NEXT: movq %rbx, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movq %r9, %rsi +; X64-NEXT: imulq %rcx, %rsi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: addq %rax, %r15 +; X64-NEXT: addq %rsi, %r15 +; X64-NEXT: addq %rax, %r14 +; X64-NEXT: adcq %r8, %r15 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: movq %r12, %rdx -; X64-NEXT: sarq $63, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: imulq %rdx, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: addq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: addq %rax, %r10 -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %r8, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: setb %sil +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rcx -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq %r13, %rbx -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %rbp -; 
X64-NEXT: movq %r14, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r9, %r10 -; X64-NEXT: adcq %r8, %rsi -; X64-NEXT: setb %r9b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %r8 -; X64-NEXT: movzbl %r9b, %eax -; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: movq %r15, %rsi -; X64-NEXT: sarq $63, %rsi -; X64-NEXT: imulq %rsi, %r11 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: addq %r11, %r9 -; X64-NEXT: addq %rax, %r9 -; X64-NEXT: addq %rax, %r8 -; X64-NEXT: adcq %r13, %r9 -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %rbx, %r10 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %rbp, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload -; X64-NEXT: adcq %rbp, %r9 -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r15, %r11 -; X64-NEXT: movq %r15, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload -; X64-NEXT: imulq %rcx, %r11 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: movq %r12, %r15 -; X64-NEXT: imulq %rsi, %r12 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r12, %r11 -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: addq %r14, %rbx -; X64-NEXT: adcq %rcx, %r11 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r13, %rsi -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: setb %r14b -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload -; X64-NEXT: mulq %rbp ; X64-NEXT: addq %r12, %rax -; X64-NEXT: movzbl %r14b, %r14d -; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r11, %rdx -; X64-NEXT: addq %r8, %rcx -; X64-NEXT: adcq %r9, %rsi -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Folded Reload -; X64-NEXT: movq %r10, %r8 -; X64-NEXT: sarq $63, %r8 -; X64-NEXT: xorq %r8, %rax -; X64-NEXT: xorq %r8, %rcx -; X64-NEXT: orq %rax, %rcx -; X64-NEXT: xorq %r8, %rdx -; X64-NEXT: xorq %rsi, %r8 -; X64-NEXT: orq %rdx, %r8 -; X64-NEXT: orq %rcx, %r8 -; 
X64-NEXT: jmp LBB1_7 -; X64-NEXT: LBB1_4: ## %overflow.no.lhs -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: xorq %rbx, %rsi -; X64-NEXT: orq %rcx, %rsi -; X64-NEXT: je LBB1_5 -; X64-NEXT: ## %bb.3: ## %overflow.no.lhs.only -; X64-NEXT: movq %r12, %rsi -; X64-NEXT: sarq $63, %rsi -; X64-NEXT: movq %r12, %rcx -; X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: movq %r8, %rbp -; X64-NEXT: xorq %rsi, %rbp -; X64-NEXT: movq %r9, %rcx -; X64-NEXT: movq %r11, %r13 -; X64-NEXT: xorq %rsi, %r13 -; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: xorq %rsi, %r10 -; X64-NEXT: subq %rsi, %r10 -; X64-NEXT: sbbq %rsi, %r13 -; X64-NEXT: sbbq %rsi, %rbp -; X64-NEXT: sbbq %rsi, %rdx -; X64-NEXT: testq %r12, %r12 -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill -; X64-NEXT: cmovnsq %r12, %rdx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: cmovnsq %r8, %rbp -; X64-NEXT: cmovnsq %r11, %r13 -; X64-NEXT: cmovnsq %rdi, %r10 -; X64-NEXT: movq %rbx, %rdx -; X64-NEXT: sarq $63, %rdx -; X64-NEXT: movq %rbx, %r12 -; X64-NEXT: xorq %rdx, %r12 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: xorq %rdx, %r14 -; X64-NEXT: xorq %rdx, %r9 -; X64-NEXT: movq %r15, %r11 -; X64-NEXT: xorq %rdx, %r11 -; X64-NEXT: subq %rdx, %r11 -; X64-NEXT: sbbq %rdx, %r9 -; X64-NEXT: sbbq %rdx, %r14 -; X64-NEXT: sbbq %rdx, %r12 -; X64-NEXT: testq %rbx, %rbx -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill -; X64-NEXT: cmovnsq %rbx, %r12 -; X64-NEXT: cmovnsq %rax, %r14 -; X64-NEXT: cmovnsq %rcx, %r9 -; X64-NEXT: cmovnsq %r15, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: setb %dil ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r8d -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %r8, %rsi -; X64-NEXT: imulq %rbp, %r9 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %rbx, %rsi +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload +; X64-NEXT: adcq %r13, %rbp +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: sarq $63, %r12 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload -; X64-NEXT: imulq %r15, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: addq %rdi, %rcx -; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; 
X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r9, %r10 -; X64-NEXT: adcq %rsi, %r8 -; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ebx -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq %rbx, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: imulq %r15, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: imulq %rbp, %r12 -; X64-NEXT: addq %r14, %r12 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: addq %rdi, %r9 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %r11, %r10 -; X64-NEXT: adcq $0, %rax -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload -; X64-NEXT: movzbl %cl, %edx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 ## 8-byte Reload -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: negq %rcx -; X64-NEXT: xorq %rcx, %r15 -; X64-NEXT: xorq %rcx, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: cmpq %rdx, %r14 -; X64-NEXT: movq %r15, %rdx -; X64-NEXT: sbbq $0, %rdx -; X64-NEXT: setb %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: xorq %rcx, %r10 -; X64-NEXT: xorq %rcx, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: xorq %rcx, %r12 -; X64-NEXT: xorq %rax, %rcx -; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r13, %rdi +; X64-NEXT: setb %r8b ; X64-NEXT: movq %r10, %rax -; X64-NEXT: sbbq $0, %rax -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: orq %rcx, %r12 -; X64-NEXT: setne %al -; X64-NEXT: jmp LBB1_8 -; X64-NEXT: LBB1_2: ## %overflow.no.rhs.only -; X64-NEXT: movq %rbx, %rdx -; X64-NEXT: sarq $63, %rdx -; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: xorq %rdx, %rcx -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: xorq %rdx, %r14 -; X64-NEXT: movq %r9, %r13 -; X64-NEXT: xorq %rdx, %r13 -; X64-NEXT: movq %r15, %r10 -; X64-NEXT: xorq %rdx, %r10 -; X64-NEXT: subq %rdx, %r10 -; X64-NEXT: sbbq %rdx, %r13 -; X64-NEXT: sbbq %rdx, %r14 -; X64-NEXT: sbbq %rdx, %rcx -; X64-NEXT: testq %rbx, %rbx -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill -; X64-NEXT: cmovnsq %rbx, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: cmovnsq %rax, %r14 -; X64-NEXT: cmovnsq %r9, %r13 -; X64-NEXT: cmovnsq %r15, %r10 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rdi, %r13 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %r9 +; X64-NEXT: movq %r11, %rdi +; X64-NEXT: movq %r11, %r8 +; X64-NEXT: sarq $63, %rdi +; X64-NEXT: imulq %rdi, %r10 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: addq %rax, %r11 +; X64-NEXT: addq %rax, %r13 +; X64-NEXT: adcq %r9, %r11 +; X64-NEXT: addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Spill +; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %r13 +; 
X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r11, %rbp +; X64-NEXT: sarq $63, %rbp +; X64-NEXT: addq %r14, %r13 +; X64-NEXT: adcq %r15, %r11 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: sarq $63, %rax -; X64-NEXT: movq %r12, %rbp -; X64-NEXT: xorq %rax, %rbp -; X64-NEXT: movq %r12, %rsi -; X64-NEXT: movq %r8, %r12 -; X64-NEXT: xorq %rax, %r12 -; X64-NEXT: movq %r11, %rbx -; X64-NEXT: xorq %rax, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: xorq %rax, %rdi -; X64-NEXT: subq %rax, %rdi -; X64-NEXT: sbbq %rax, %rbx -; X64-NEXT: sbbq %rax, %r12 -; X64-NEXT: sbbq %rax, %rbp -; X64-NEXT: testq %rsi, %rsi -; X64-NEXT: sets {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill -; X64-NEXT: cmovnsq %rsi, %rbp -; X64-NEXT: cmovnsq %r8, %r12 -; X64-NEXT: cmovnsq %r11, %rbx -; X64-NEXT: cmovnsq %rdx, %rdi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: adcq %r12, %rbp +; X64-NEXT: movq %r8, %rbx +; X64-NEXT: imulq %rcx, %r8 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r9d -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rcx, %r11 -; X64-NEXT: adcq %r9, %rsi -; X64-NEXT: imulq %r14, %rbx +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Reload +; X64-NEXT: imulq %r12, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload -; X64-NEXT: addq %rbx, %r9 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: adcq %rsi, %r9 -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %r11, %rbx -; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r15d -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r11, %rbx -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: movq %r8, %r15 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: imulq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload -; X64-NEXT: imulq %r14, %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload -; X64-NEXT: addq %rdx, %rbp -; X64-NEXT: addq %r12, %rbp -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %r9, %r10 -; X64-NEXT: adcq $0, %rax -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X64-NEXT: xorb {{[-0-9]+}}(%r{{[sb]}}p), %cl ## 1-byte Folded Reload 
-; X64-NEXT: movzbl %cl, %edx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: negq %rcx -; X64-NEXT: xorq %rcx, %r15 -; X64-NEXT: xorq %rcx, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: cmpq %rdx, %r14 -; X64-NEXT: movq %r15, %rdx -; X64-NEXT: sbbq $0, %rdx -; X64-NEXT: setb %dl -; X64-NEXT: movzbl %dl, %edx -; X64-NEXT: xorq %rcx, %r10 -; X64-NEXT: xorq %rcx, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: xorq %rcx, %rbp -; X64-NEXT: xorq %rax, %rcx -; X64-NEXT: cmpq %rdx, %rdi -; X64-NEXT: movq %r10, %rax -; X64-NEXT: sbbq $0, %rax -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: orq %rcx, %rbp -; X64-NEXT: LBB1_7: ## %overflow.res -; X64-NEXT: setne %al -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: jmp LBB1_8 -; X64-NEXT: LBB1_5: ## %overflow.no -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: imulq %r11, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: imulq %rdi, %rbx -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: imulq %r15, %r12 -; X64-NEXT: addq %rdx, %r12 -; X64-NEXT: imulq %r9, %r8 -; X64-NEXT: addq %r12, %r8 -; X64-NEXT: addq %rsi, %rcx -; X64-NEXT: adcq %rbx, %r8 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %r10 +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: addq %r9, %r14 +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rbx, %r15 -; X64-NEXT: adcq %r10, %rsi -; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ebx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rsi, %rdi -; X64-NEXT: adcq %rbx, %r10 -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: LBB1_8: ## %overflow.res -; X64-NEXT: movq %r14, (%r13) -; X64-NEXT: movq %r15, 8(%r13) -; X64-NEXT: movq %rdi, 16(%r13) -; X64-NEXT: movq %r10, 24(%r13) -; X64-NEXT: andb $1, %al -; X64-NEXT: ## kill: def $al killed $al killed $eax +; X64-NEXT: addq %r15, %rdi +; X64-NEXT: adcq %r9, %r8 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: addq %r8, %rax +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r14, %rax +; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: addq %r13, %rsi +; X64-NEXT: adcq %r11, %rdi +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Reload +; X64-NEXT: movq %r8, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: xorq %rcx, %rax +; X64-NEXT: xorq %rcx, %rsi +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: xorq %rdi, %rcx +; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: orq %rsi, %rcx +; X64-NEXT: movq 
{{[0-9]+}}(%rsp), %rax +; X64-NEXT: movq %r8, 24(%rax) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: movq %rcx, (%rax) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: movq %rcx, 8(%rax) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: movq %rcx, 16(%rax) +; X64-NEXT: setne %al ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 @@ -1406,1703 +513,350 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X64-NEXT: popq %rbp ; X64-NEXT: retq ; -; X86-LABEL: smuloi256: -; X86: ## %bb.0: ## %overflow.entry -; X86-NEXT: pushl %ebp -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: pushl %edi -; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $128, %esp -; X86-NEXT: .cfi_def_cfa_offset 148 -; X86-NEXT: .cfi_offset %esi, -20 -; X86-NEXT: .cfi_offset %edi, -16 -; X86-NEXT: .cfi_offset %ebx, -12 -; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sarl $31, %ecx -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: je LBB1_12 -; X86-NEXT: ## %bb.1: ## %overflow.lhs -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: je LBB1_2 -; X86-NEXT: ## %bb.14: ## %overflow -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: 
addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: setb %bl -; X86-NEXT: movl %eax, %esi -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx -; X86-NEXT: addl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte 
Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; 
X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sarl $31, %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl $0, %esi -; 
X86-NEXT: movl %esi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl 
%eax, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %edx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: imull %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: xorl %edx, %edi -; 
X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: jmp LBB1_15 -; X86-NEXT: LBB1_12: ## %overflow.no.lhs -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: je LBB1_13 -; X86-NEXT: ## %bb.7: ## %overflow.no.lhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB1_9 -; X86-NEXT: ## %bb.8: ## %overflow.no.lhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: LBB1_9: ## %overflow.no.lhs.only -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB1_11 -; X86-NEXT: ## %bb.10: ## %overflow.no.lhs.only -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: LBB1_11: ## %overflow.no.lhs.only -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, 
%ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl (%esp), %ebp ## 4-byte Reload -; X86-NEXT: imull %eax, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl (%esp), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded 
Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: imull %ebx, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: imull %ecx, %edx -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload -; X86-NEXT: movzbl %cl, %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: movl (%esp), %ebp ## 4-byte Reload -; X86-NEXT: xorl %ecx, %ebp 
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: cmpl %esi, %ebp -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: jmp LBB1_15 -; X86-NEXT: LBB1_2: ## %overflow.no.rhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: subl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: testl %eax, %eax -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-LABEL: smuloi256: +; X86: ## %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: pushl %ebx +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: pushl %edi +; X86-NEXT: .cfi_def_cfa_offset 16 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: subl $128, %esp +; X86-NEXT: .cfi_def_cfa_offset 148 +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: js LBB1_4 -; X86-NEXT: ## %bb.3: ## %overflow.no.rhs.only -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: LBB1_4: ## %overflow.no.rhs.only +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill -; X86-NEXT: sbbl %eax, 
%esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: sbbl %eax, %esi -; X86-NEXT: sbbl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: js LBB1_6 -; X86-NEXT: ## %bb.5: ## %overflow.no.rhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: LBB1_6: ## %overflow.no.rhs.only -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: adcl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: sarl $31, %eax ; X86-NEXT: 
movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: addl %esi, %edi ; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl %ebx, %eax @@ -3112,277 +866,203 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 
4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: addl %edx, %edi +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: mull %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: imull %ecx, %edx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: 
mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb %cl +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: addl %esi, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl ## 1-byte Folded Reload -; X86-NEXT: movzbl %cl, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl $0, %edx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl $0, %edx -; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: xorl %ecx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: cmpl %esi, %ebx -; X86-NEXT: movl (%esp), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl %edx, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ebp, %edi -; X86-NEXT: LBB1_15: ## %overflow.res -; X86-NEXT: setne %al -; X86-NEXT: jmp LBB1_16 -; X86-NEXT: LBB1_13: ## %overflow.no -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi @@ -3393,60 +1073,58 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %edi, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebp +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %eax +; 
X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl %eax, %edi +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx @@ -3455,15 +1133,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -3472,151 +1150,136 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %eax, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: addl 
%ecx, %edx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %ecx, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ecx, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %edi, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %esi, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %esi, %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: imull %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx 
## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: LBB1_16: ## %overflow.res -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 4(%ecx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %esi, 16(%ecx) -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 20(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 24(%ecx) +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, 28(%ecx) -; X86-NEXT: andb $1, %al -; X86-NEXT: ## kill: def $al killed $al killed $eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sarl $31, %edx +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: xorl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %esi, 28(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 20(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl %ecx, 24(%eax) +; X86-NEXT: setne %al ; X86-NEXT: addl $128, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 4ccb90a37ca71..4c3170304b980 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -4,19 +4,14 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) 
unnamed_addr #0 { ; X64-LABEL: muloti_test: -; X64: # %bb.0: # %overflow.entry +; X64: # %bb.0: # %start ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: testq %rsi, %rsi -; X64-NEXT: je .LBB0_3 -; X64-NEXT: # %bb.1: # %overflow.lhs +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: testq %rcx, %rcx -; X64-NEXT: je .LBB0_7 -; X64-NEXT: # %bb.2: # %overflow -; X64-NEXT: setne %al +; X64-NEXT: setne %dl ; X64-NEXT: testq %rsi, %rsi ; X64-NEXT: setne %r9b -; X64-NEXT: andb %al, %r9b -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: andb %dl, %r9b ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: seto %r10b @@ -31,59 +26,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: setb %cl ; X64-NEXT: orb %r11b, %cl -; X64-NEXT: andb $1, %cl -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: retq -; X64-NEXT: .LBB0_3: # %overflow.no.lhs -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: testq %rcx, %rcx -; X64-NEXT: je .LBB0_8 -; X64-NEXT: # %bb.4: # %overflow.no.lhs.only -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: imulq %rcx, %rsi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: addq %r8, %rdx -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: jmp .LBB0_5 -; X64-NEXT: .LBB0_7: # %overflow.no.rhs.only -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: imulq %rcx, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: imulq %rsi, %rcx -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: .LBB0_5: # %overflow.no.lhs.only -; X64-NEXT: setne %cl -; X64-NEXT: andb $1, %cl -; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: retq -; X64-NEXT: .LBB0_8: # %overflow.no -; X64-NEXT: imulq %rcx, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: xorl %ecx, %ecx -; X64-NEXT: movq %rsi, %rdx -; X64-NEXT: andb $1, %cl -; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: retq ; ; X86-LABEL: muloti_test: -; X86: # %bb.0: # %overflow.entry +; X86: # %bb.0: # %start ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -92,352 +38,116 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $36, %esp -; X86-NEXT: .cfi_def_cfa_offset 56 +; X86-NEXT: subl $24, %esp +; X86-NEXT: .cfi_def_cfa_offset 44 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: je .LBB0_4 -; X86-NEXT: # %bb.1: # %overflow.lhs -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: je .LBB0_2 -; X86-NEXT: # %bb.6: # %overflow -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull %esi -; X86-NEXT: leal (%edi,%eax), %ecx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: leal (%ecx,%eax), %esi +; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %ebx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: leal (%ebx,%eax), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: leal (%esi,%eax), %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %edi -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: setne %al -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: setne 
%ah -; X86-NEXT: andb %al, %ah +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setne %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setne %ch +; X86-NEXT: andb %cl, %ch +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: orb %ah, %cl +; X86-NEXT: orb %ch, %cl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: setne %al +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: testl %edi, %edi +; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi -; X86-NEXT: setne %ah -; X86-NEXT: andb %al, %ah -; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb %ah, %al -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: setne %ch -; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: setne %bl -; X86-NEXT: andb %ch, %bl -; X86-NEXT: orb %al, %bl -; X86-NEXT: orb %cl, %bl +; X86-NEXT: andb %cl, %ch +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_4: # %overflow.no.lhs -; X86-NEXT: movl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: je .LBB0_5 -; X86-NEXT: # %bb.3: # %overflow.no.lhs.only -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ecx, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: setne %bl -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_2: # %overflow.no.rhs.only -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx 
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl $0, %eax -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: setne %bl -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_5: # %overflow.no -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull %esi, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %esi, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl %ecx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orb %ch, %bl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: setne %bh +; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, (%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: setne %al +; X86-NEXT: andb %bh, %al +; X86-NEXT: orb %bl, %al +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: andb $1, %al +; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: .LBB0_7: # %overflow.res -; X86-NEXT: andb $1, %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, 
8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movb %bl, 16(%eax) -; X86-NEXT: addl $36, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll index 99dc422a6b53e..132683cdb0f9e 100644 --- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -3,7 +3,7 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-LABEL: mulodi_test: -; X86: # %bb.0: # %overflow.entry +; X86: # %bb.0: # %start ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %ebx @@ -12,89 +12,32 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: pushl %eax -; X86-NEXT: .cfi_def_cfa_offset 24 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: testl %edi, %edi -; X86-NEXT: je .LBB0_4 -; X86-NEXT: # %bb.1: # %overflow.lhs -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: je .LBB0_2 -; X86-NEXT: # %bb.6: # %overflow -; X86-NEXT: setne %al -; X86-NEXT: testl %edi, %edi +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setne %dl +; X86-NEXT: testl %eax, %eax ; X86-NEXT: setne %cl -; X86-NEXT: andb %al, %cl -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: andb %dl, %cl +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %ch -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %esi ; X86-NEXT: seto %bl -; X86-NEXT: orb %ch, %bl -; X86-NEXT: orb %cl, %bl -; X86-NEXT: leal (%edi,%eax), %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: setb %cl -; X86-NEXT: orb %bl, %cl -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_4: # %overflow.no.lhs ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: je .LBB0_5 -; X86-NEXT: # %bb.3: # %overflow.no.lhs.only -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setne %cl -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_2: # %overflow.no.rhs.only -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: imull %edi, %ebx +; X86-NEXT: seto %ch +; X86-NEXT: orb %bl, %ch +; X86-NEXT: orb %cl, %ch +; X86-NEXT: leal (%edi,%eax), %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setne %cl -; X86-NEXT: jmp .LBB0_7 -; X86-NEXT: .LBB0_5: # %overflow.no -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: imull %ebp, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: xorl 
%ecx, %ecx -; X86-NEXT: movl %edi, %edx -; X86-NEXT: .LBB0_7: # %overflow.res -; X86-NEXT: andb $1, %cl -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: addl $4, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: setb %cl +; X86-NEXT: orb %ch, %cl ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 2601b73f26822..a076d0d762aa3 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -13,7 +13,7 @@ define {i64, i1} @t1() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t1: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: movl $72, %eax ; WIN32-NEXT: xorl %edx, %edx ; WIN32-NEXT: xorl %ecx, %ecx @@ -30,7 +30,7 @@ define {i64, i1} @t2() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t2: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: xorl %eax, %eax ; WIN32-NEXT: xorl %edx, %edx ; WIN32-NEXT: xorl %ecx, %ecx @@ -47,7 +47,7 @@ define {i64, i1} @t3() nounwind { ; CHECK-NEXT: retq ; ; WIN32-LABEL: t3: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: movl $-9, %eax ; WIN32-NEXT: movl $-1, %edx ; WIN32-NEXT: movb $1, %cl @@ -204,207 +204,59 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $8, %esp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: sarl $31, %edx -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: subl %edx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebx, %edx -; WIN32-NEXT: je LBB6_13 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB6_2 -; WIN32-NEXT: # %bb.15: # %overflow -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull %esi, %ebp -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl (%esp), %edi # 4-byte Folded Reload -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: imull %ebx, %esi ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: adcl %esi, %ebx +; 
WIN32-NEXT: movl %ebx, %edi ; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; WIN32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: imull %ebx +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %esi, %ecx +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl %edi, %ecx -; WIN32-NEXT: setne %al -; WIN32-NEXT: jmp LBB6_16 -; WIN32-NEXT: LBB6_13: # %overflow.no.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB6_14 -; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebp -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB6_9 -; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: LBB6_9: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB6_11 -; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: LBB6_11: # %overflow.no.lhs.only -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ecx, %ebp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %edi, %ecx -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; WIN32-NEXT: jmp LBB6_12 -; WIN32-NEXT: LBB6_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ebp -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB6_4 -; WIN32-NEXT: # 
%bb.3: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: LBB6_4: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %ebx -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %edi -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB6_6 -; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: LBB6_6: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %ebp, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %edi, %ebp -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload -; WIN32-NEXT: LBB6_12: # %overflow.res -; WIN32-NEXT: movzbl %al, %esi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: negl %eax -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: xorl %ebx, %ebx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: subl %esi, %edi -; WIN32-NEXT: setb %bl -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: xorl %edx, %eax -; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: subl %ebx, %edx -; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %esi, 4(%eax) +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: jmp LBB6_16 -; WIN32-NEXT: LBB6_14: # %overflow.no -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: imull %edi, %ebx -; WIN32-NEXT: addl %edx, %ebx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: LBB6_16: # %overflow.res -; WIN32-NEXT: movl %ebp, (%esi) -; WIN32-NEXT: movl %ecx, 4(%esi) -; WIN32-NEXT: andb $1, %al -; WIN32-NEXT: # kill: def $al killed $al killed $eax -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -597,93 +449,37 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: je LBB10_5 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je LBB10_2 -; WIN32-NEXT: # %bb.7: # %overflow -; WIN32-NEXT: setne %al -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: setne %dl +; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl -; 
WIN32-NEXT: andb %al, %cl -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload -; WIN32-NEXT: leal (%edx,%eax), %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: setb %dl -; WIN32-NEXT: orb %ch, %dl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: jmp LBB10_8 -; WIN32-NEXT: LBB10_5: # %overflow.no.lhs -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je LBB10_6 -; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebp, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: jmp LBB10_3 -; WIN32-NEXT: LBB10_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ebx, %ebp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: LBB10_3: # %overflow.res -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %dl -; WIN32-NEXT: jmp LBB10_8 -; WIN32-NEXT: LBB10_6: # %overflow.no -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: xorl %edx, %edx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: LBB10_8: # %overflow.res -; WIN32-NEXT: movl %esi, (%ecx) -; WIN32-NEXT: movl %eax, 4(%ecx) -; WIN32-NEXT: andb $1, %dl -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: setb %cl +; WIN32-NEXT: orb %ch, %cl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %eax, (%esi) +; WIN32-NEXT: movl %edx, 4(%esi) +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -751,215 +547,66 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloselecti64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: subl %ecx, %edx -; WIN32-NEXT: je LBB12_13 -; WIN32-NEXT: # 
%bb.1: # %overflow.lhs -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: subl %esi, %ecx -; WIN32-NEXT: je LBB12_2 -; WIN32-NEXT: # %bb.15: # %overflow -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull %esi, %ecx -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: adcl %ecx, %ebx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: adcl %esi, %ebx ; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %esi, %edi -; WIN32-NEXT: adcl %ecx, %ebp -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: addl %ebx, %ebp +; WIN32-NEXT: addl %ebp, %edi +; WIN32-NEXT: adcl %esi, %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: addl %ebx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: imull %ebx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebp, %edx ; WIN32-NEXT: sarl $31, %edi ; WIN32-NEXT: xorl %edi, %edx ; WIN32-NEXT: xorl %eax, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: orl %edx, %edi +; WIN32-NEXT: jne LBB12_2 +; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: setne %cl -; WIN32-NEXT: testb $1, %cl -; WIN32-NEXT: je LBB12_17 -; WIN32-NEXT: jmp LBB12_18 -; WIN32-NEXT: LBB12_13: # %overflow.no.lhs -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: subl %esi, %ecx -; WIN32-NEXT: je LBB12_14 -; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: xorl %ecx, %esi -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: xorl %ecx, %edi -; WIN32-NEXT: subl %ecx, %edi -; WIN32-NEXT: sbbl %ecx, %esi -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB12_10 -; WIN32-NEXT: # %bb.9: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: LBB12_10: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: subl %eax, %ebp -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %ebx, %ebx -; 
WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB12_12 -; WIN32-NEXT: # %bb.11: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: LBB12_12: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %esi, %ebp -; WIN32-NEXT: addl %edx, %ebp -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: adcl %esi, %edi -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; WIN32-NEXT: movzbl %cl, %esi -; WIN32-NEXT: movl %esi, %ecx -; WIN32-NEXT: negl %ecx -; WIN32-NEXT: xorl %ecx, %ebx -; WIN32-NEXT: addl %esi, %ebx -; WIN32-NEXT: xorl %edx, %edx -; WIN32-NEXT: subl %esi, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: setb %dl -; WIN32-NEXT: xorl %ecx, %eax -; WIN32-NEXT: addl %edx, %eax -; WIN32-NEXT: xorl %edi, %ecx -; WIN32-NEXT: subl %edx, %eax -; WIN32-NEXT: adcl $0, %ecx -; WIN32-NEXT: setne %cl -; WIN32-NEXT: jmp LBB12_7 -; WIN32-NEXT: LBB12_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax ; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %esi -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB12_4 -; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: LBB12_4: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: subl %eax, %ebp -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB12_6 -; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: LBB12_6: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %esi, %ebp -; WIN32-NEXT: addl %edx, %ebp -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: adcl %esi, %edi -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; WIN32-NEXT: movzbl %cl, %esi -; WIN32-NEXT: movl %esi, %ecx -; WIN32-NEXT: negl %ecx -; WIN32-NEXT: xorl %ecx, %ebx -; WIN32-NEXT: addl %esi, %ebx -; WIN32-NEXT: xorl %edx, %edx -; WIN32-NEXT: subl %esi, %ebx -; WIN32-NEXT: setb %dl -; WIN32-NEXT: xorl %ecx, %eax -; WIN32-NEXT: addl %edx, %eax -; WIN32-NEXT: xorl %edi, %ecx -; WIN32-NEXT: subl %edx, %eax -; WIN32-NEXT: adcl $0, %ecx -; WIN32-NEXT: setne %cl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: LBB12_7: # %overflow.res -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: testb $1, %cl -; WIN32-NEXT: jne LBB12_18 -; 
WIN32-NEXT: LBB12_17: # %overflow.res -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: LBB12_18: # %overflow.res -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: LBB12_2: +; WIN32-NEXT: movl %esi, %edx +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB12_14: # %overflow.no -; WIN32-NEXT: xorl %ecx, %ecx -; WIN32-NEXT: testb $1, %cl -; WIN32-NEXT: je LBB12_17 -; WIN32-NEXT: jmp LBB12_18 %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -1023,86 +670,45 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloselecti64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: je LBB14_5 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: testl %edi, %edi -; WIN32-NEXT: je LBB14_2 -; WIN32-NEXT: # %bb.7: # %overflow +; WIN32-NEXT: testl %ebp, %ebp ; WIN32-NEXT: setne %al ; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %al, %cl +; WIN32-NEXT: setne %bl +; WIN32-NEXT: andb %al, %bl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %edi, %edx +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: seto %ch -; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload -; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: addl %eax, %ebp -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: mull %edx -; WIN32-NEXT: addl %ebp, %edx -; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %ch, %al -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: je LBB14_9 -; WIN32-NEXT: jmp LBB14_10 -; WIN32-NEXT: LBB14_5: # %overflow.no.lhs -; WIN32-NEXT: testl %edi, %edi -; WIN32-NEXT: je LBB14_6 -; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: seto %bh +; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; WIN32-NEXT: orb %bl, %bh +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: setb %al +; WIN32-NEXT: orb %bh, %al +; WIN32-NEXT: testb %al, %al +; WIN32-NEXT: jne LBB14_2 +; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: imull %esi, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: movl %esi, %ebp -; WIN32-NEXT: imull %eax, %ebp -; WIN32-NEXT: movl %eax, %edx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %edx -; WIN32-NEXT: jmp LBB14_3 -; WIN32-NEXT: LBB14_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: imull %edi, %ecx -; 
WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl %edi, %ebp -; WIN32-NEXT: imull %esi, %ebp -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: mull %esi -; WIN32-NEXT: LBB14_3: # %overflow.res -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %al -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: jne LBB14_10 -; WIN32-NEXT: LBB14_9: # %overflow.res -; WIN32-NEXT: movl %edi, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: LBB14_10: # %overflow.res -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: LBB14_2: +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: movl %esi, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi @@ -1110,12 +716,6 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) { ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB14_6: # %overflow.no -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: je LBB14_9 -; WIN32-NEXT: jmp LBB14_10 %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 %ret = select i1 %obit, i64 %v1, i64 %v2 @@ -1352,47 +952,35 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smulobri64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $8, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: sarl $31, %edx -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: subl %edx, %esi -; WIN32-NEXT: je LBB18_12 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB18_2 -; WIN32-NEXT: # %bb.14: # %overflow1 -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebp, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %edi ; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %ebx, %ebp -; WIN32-NEXT: adcl %esi, %edi +; WIN32-NEXT: adcl %ecx, %edi ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %eax, %ecx +; WIN32-NEXT: imull %esi, %ecx +; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull %edx ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %esi @@ -1401,7 +989,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl %ebx, %ebp ; WIN32-NEXT: sarl $31, %ebp ; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; WIN32-NEXT: adcl (%esp), %ebp # 
4-byte Folded Reload ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %ebx, %eax @@ -1410,148 +998,19 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: xorl %esi, %edx ; WIN32-NEXT: xorl %eax, %esi ; WIN32-NEXT: orl %edx, %esi -; WIN32-NEXT: jmp LBB18_15 -; WIN32-NEXT: LBB18_12: # %overflow.no.lhs -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB18_13 -; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %edi, %ebx -; WIN32-NEXT: xorl %eax, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ebx -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB18_9 -; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: LBB18_9: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebp -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB18_11 -; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: LBB18_11: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebp, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; WIN32-NEXT: movzbl %bl, %edi -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: negl %esi -; WIN32-NEXT: xorl %esi, %ecx -; WIN32-NEXT: addl %edi, %ecx -; WIN32-NEXT: xorl %ebx, %ebx -; WIN32-NEXT: subl %edi, %ecx -; WIN32-NEXT: setb %bl -; WIN32-NEXT: xorl %esi, %eax -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: xorl %edx, %esi -; WIN32-NEXT: subl %ebx, %eax -; WIN32-NEXT: adcl $0, %esi -; WIN32-NEXT: jmp LBB18_15 -; WIN32-NEXT: LBB18_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebp, %ebx -; WIN32-NEXT: xorl %eax, %ebx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ebx -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: js LBB18_4 -; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %ebx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: LBB18_4: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl %esi, %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: subl %eax, %ebp -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB18_6 -; WIN32-NEXT: # %bb.5: # 
%overflow.no.rhs.only -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %esi, %ebp -; WIN32-NEXT: LBB18_6: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %ebx, %ebp -; WIN32-NEXT: addl %edx, %ebp -; WIN32-NEXT: imull %ecx, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; WIN32-NEXT: movzbl %cl, %edi -; WIN32-NEXT: movl %edi, %ecx -; WIN32-NEXT: negl %ecx -; WIN32-NEXT: xorl %ecx, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: xorl %ebx, %ebx -; WIN32-NEXT: subl %edi, %esi -; WIN32-NEXT: setb %bl -; WIN32-NEXT: xorl %ecx, %eax -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: xorl %edx, %ecx -; WIN32-NEXT: subl %ebx, %eax -; WIN32-NEXT: adcl $0, %ecx -; WIN32-NEXT: LBB18_15: # %overflow.res -; WIN32-NEXT: setne %al -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: jne LBB18_17 -; WIN32-NEXT: LBB18_19: # %continue +; WIN32-NEXT: jne LBB18_1 +; WIN32-NEXT: # %bb.3: # %continue ; WIN32-NEXT: movb $1, %al -; WIN32-NEXT: LBB18_18: # %overflow -; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: LBB18_2: # %overflow +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB18_13: # %overflow.no -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: je LBB18_19 -; WIN32-NEXT: LBB18_17: # %overflow +; WIN32-NEXT: LBB18_1: # %overflow ; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: jmp LBB18_18 +; WIN32-NEXT: jmp LBB18_2 %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1802,90 +1261,46 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umulobri64: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: je LBB22_5 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: testl %edi, %edi -; WIN32-NEXT: je LBB22_2 -; WIN32-NEXT: # %bb.7: # %overflow1 -; WIN32-NEXT: setne %al -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %dl -; WIN32-NEXT: andb %al, %dl -; WIN32-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: seto %bh -; WIN32-NEXT: orb %bl, %bh -; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; WIN32-NEXT: leal (%ebp,%eax), %edi ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %edi, %edx +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch +; WIN32-NEXT: leal 
(%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %bh, %al -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: je LBB22_11 -; WIN32-NEXT: jmp LBB22_9 -; WIN32-NEXT: LBB22_5: # %overflow.no.lhs -; WIN32-NEXT: testl %edi, %edi -; WIN32-NEXT: je LBB22_6 -; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: imull %edi, %ebx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: addl %ecx, %eax -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: jmp LBB22_3 -; WIN32-NEXT: LBB22_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: LBB22_3: # %overflow.res -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %al -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: jne LBB22_9 -; WIN32-NEXT: LBB22_11: # %continue +; WIN32-NEXT: orb %ch, %al +; WIN32-NEXT: subb $1, %al +; WIN32-NEXT: je LBB22_1 +; WIN32-NEXT: # %bb.3: # %continue ; WIN32-NEXT: movb $1, %al -; WIN32-NEXT: LBB22_10: # %overflow -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: LBB22_2: # %overflow ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl -; WIN32-NEXT: LBB22_6: # %overflow.no +; WIN32-NEXT: LBB22_1: # %overflow ; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: testb $1, %al -; WIN32-NEXT: je LBB22_11 -; WIN32-NEXT: LBB22_9: # %overflow -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: jmp LBB22_10 +; WIN32-NEXT: jmp LBB22_2 %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -1919,33 +1334,18 @@ define i1 @bug27873(i64 %c1, i1 %c2) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: bug27873: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebx -; WIN32-NEXT: pushl %edi -; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: je LBB23_2 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl $160, %ebx -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: adcl $0, %edx -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %al -; WIN32-NEXT: jmp LBB23_3 -; WIN32-NEXT: LBB23_2: # %overflow.no.lhs -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: LBB23_3: # %overflow.res -; WIN32-NEXT: orb %al, %cl -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: popl %esi -; WIN32-NEXT: popl %edi +; WIN32-NEXT: movl $160, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: seto %bl +; WIN32-NEXT: movl $160, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %ecx, %edx +; WIN32-NEXT: setb %al +; WIN32-NEXT: orb %bl, %al +; WIN32-NEXT: orb {{[0-9]+}}(%esp), %al ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: retl %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160) @@ -2235,208 +1635,62 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64_load: -; WIN32: # 
%bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $16, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edi -; WIN32-NEXT: movl 4(%eax), %ecx -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: sarl $31, %edx -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: subl %edx, %esi +; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebx, %edx -; WIN32-NEXT: je LBB30_13 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB30_2 -; WIN32-NEXT: # %bb.15: # %overflow -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: imull %esi, %ebp -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl (%eax), %ecx +; WIN32-NEXT: movl 4(%eax), %ebp +; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %edi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: adcl %esi, %ebx +; WIN32-NEXT: movl %ebx, %edi ; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; WIN32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: imull %ebx +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %esi, %ecx +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; WIN32-NEXT: imull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl 
$31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl %edi, %ecx -; WIN32-NEXT: setne %al -; WIN32-NEXT: jmp LBB30_16 -; WIN32-NEXT: LBB30_13: # %overflow.no.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB30_14 -; WIN32-NEXT: # %bb.7: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebp -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB30_9 -; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %ebp -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: LBB30_9: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB30_11 -; WIN32-NEXT: # %bb.10: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: LBB30_11: # %overflow.no.lhs.only -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ecx, %ebp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %edi, %ecx -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; WIN32-NEXT: jmp LBB30_12 -; WIN32-NEXT: LBB30_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebp -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB30_4 -; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebx, %ebp -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: LBB30_4: # %overflow.no.rhs.only -; WIN32-NEXT: movl %esi, %edx -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %ebx -; WIN32-NEXT: xorl %eax, %ebx -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebx -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB30_6 -; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ecx, %ebx -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: LBB30_6: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ebp, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebx, %ebp -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: movl %edi, %ebp -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
1-byte Folded Reload -; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload -; WIN32-NEXT: LBB30_12: # %overflow.res -; WIN32-NEXT: movzbl %al, %esi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: negl %eax -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: xorl %ebx, %ebx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: subl %esi, %edi -; WIN32-NEXT: setb %bl -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: xorl %edx, %eax -; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: subl %ebx, %edx -; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %esi, 4(%eax) +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: jmp LBB30_16 -; WIN32-NEXT: LBB30_14: # %overflow.no -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: imull %edi, %ebx -; WIN32-NEXT: addl %edx, %ebx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: LBB30_16: # %overflow.res -; WIN32-NEXT: movl %ebp, (%esi) -; WIN32-NEXT: movl %ecx, 4(%esi) -; WIN32-NEXT: andb $1, %al -; WIN32-NEXT: # kill: def $al killed $al killed $eax -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2474,206 +1728,61 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi64_load2: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $12, %esp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl (%edx), %ebx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: subl %esi, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl 4(%edx), %ebp -; WIN32-NEXT: movl %ebp, %edx -; WIN32-NEXT: je LBB31_13 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB31_2 -; WIN32-NEXT: # %bb.15: # %overflow -; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl (%ecx), %ebx +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: sarl $31, %esi ; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull %ebx ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl 4(%eax), %ecx +; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: movl %ebp, %esi +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: sarl $31, %edi +; 
WIN32-NEXT: movl %ecx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %eax, %esi +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %edi -; WIN32-NEXT: movl %edi, %esi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: addl %ebx, %edi -; WIN32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: imull %ebp -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: adcl %esi, %ecx +; WIN32-NEXT: movl %ecx, %ebp +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: adcl %edi, %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull (%esp) # 4-byte Folded Reload +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: adcl %ebp, %edx +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx -; WIN32-NEXT: movl %edi, %ecx -; WIN32-NEXT: setne %al -; WIN32-NEXT: jmp LBB31_16 -; WIN32-NEXT: LBB31_13: # %overflow.no.lhs -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: je LBB31_14 -; WIN32-NEXT: # %bb.8: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %esi -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB31_10 -; WIN32-NEXT: # %bb.9: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ecx, %esi -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: LBB31_10: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edi, %edx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: subl %eax, %edi -; WIN32-NEXT: sbbl %eax, %ecx -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB31_12 -; WIN32-NEXT: # %bb.11: # %overflow.no.lhs.only -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: LBB31_12: # %overflow.no.lhs.only -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: imull %esi, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %edi, %ecx -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; WIN32-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; WIN32-NEXT: jmp LBB31_7 -; WIN32-NEXT: LBB31_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: xorl %eax, %edi 
-; WIN32-NEXT: movl %ebx, %edx -; WIN32-NEXT: xorl %eax, %edx -; WIN32-NEXT: subl %eax, %edx -; WIN32-NEXT: sbbl %eax, %edi -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: js LBB31_4 -; WIN32-NEXT: # %bb.3: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: movl %ebx, %edx -; WIN32-NEXT: LBB31_4: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %ecx, %ebx -; WIN32-NEXT: xorl %eax, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: xorl %eax, %esi -; WIN32-NEXT: subl %eax, %esi -; WIN32-NEXT: sbbl %eax, %ebx -; WIN32-NEXT: testl %ecx, %ecx -; WIN32-NEXT: sets (%esp) # 1-byte Folded Spill -; WIN32-NEXT: js LBB31_6 -; WIN32-NEXT: # %bb.5: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ecx, %ebx -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: LBB31_6: # %overflow.no.rhs.only -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %esi, %ecx -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; WIN32-NEXT: xorb (%esp), %al # 1-byte Folded Reload -; WIN32-NEXT: LBB31_7: # %overflow.res -; WIN32-NEXT: movzbl %al, %esi -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: negl %eax -; WIN32-NEXT: xorl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: xorl %ebx, %ebx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: subl %esi, %edi -; WIN32-NEXT: setb %bl -; WIN32-NEXT: xorl %eax, %ecx -; WIN32-NEXT: addl %ebx, %ecx -; WIN32-NEXT: xorl %edx, %eax -; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: subl %ebx, %edx -; WIN32-NEXT: adcl $0, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: movl %esi, 4(%eax) +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: jmp LBB31_16 -; WIN32-NEXT: LBB31_14: # %overflow.no -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: imull %edi, %ebp -; WIN32-NEXT: addl %edx, %ebp -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %ebp, %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: xorl %eax, %eax -; WIN32-NEXT: LBB31_16: # %overflow.res -; WIN32-NEXT: movl %ebp, (%esi) -; WIN32-NEXT: movl %ecx, 4(%esi) -; WIN32-NEXT: andb $1, %al -; WIN32-NEXT: # kill: def $al killed $al killed $eax ; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -3024,94 +2133,38 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64_load: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edi -; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: je LBB38_5 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je 
LBB38_2 -; WIN32-NEXT: # %bb.7: # %overflow -; WIN32-NEXT: setne %al -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: movl (%eax), %ebp +; WIN32-NEXT: movl 4(%eax), %eax +; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: setne %dl +; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %al, %cl -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload -; WIN32-NEXT: leal (%edx,%eax), %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: setb %dl -; WIN32-NEXT: orb %ch, %dl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: jmp LBB38_8 -; WIN32-NEXT: LBB38_5: # %overflow.no.lhs -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je LBB38_6 -; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebp, %ebx -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: jmp LBB38_3 -; WIN32-NEXT: LBB38_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ebx, %ebp -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: LBB38_3: # %overflow.res -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %dl -; WIN32-NEXT: jmp LBB38_8 -; WIN32-NEXT: LBB38_6: # %overflow.no -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: xorl %edx, %edx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: LBB38_8: # %overflow.res -; WIN32-NEXT: movl %esi, (%ecx) -; WIN32-NEXT: movl %eax, 4(%ecx) -; WIN32-NEXT: andb $1, %dl -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: setb %cl +; WIN32-NEXT: orb %ch, %cl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %eax, (%esi) +; WIN32-NEXT: movl %edx, 4(%esi) +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -3157,94 +2210,38 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64_load2: -; WIN32: # %bb.0: # %overflow.entry +; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %edi -; 
WIN32-NEXT: movl 4(%eax), %ebp -; WIN32-NEXT: testl %ebx, %ebx -; WIN32-NEXT: je LBB39_5 -; WIN32-NEXT: # %bb.1: # %overflow.lhs -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je LBB39_2 -; WIN32-NEXT: # %bb.7: # %overflow -; WIN32-NEXT: setne %al -; WIN32-NEXT: testl %ebx, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl (%ecx), %ebp +; WIN32-NEXT: movl 4(%ecx), %esi +; WIN32-NEXT: testl %eax, %eax +; WIN32-NEXT: setne %dl +; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %cl -; WIN32-NEXT: andb %al, %cl -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: seto %bl -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: seto %ch ; WIN32-NEXT: orb %bl, %ch ; WIN32-NEXT: orb %cl, %ch -; WIN32-NEXT: movl (%esp), %edx # 4-byte Reload -; WIN32-NEXT: leal (%edx,%eax), %ebx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: setb %dl -; WIN32-NEXT: orb %ch, %dl -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: jmp LBB39_8 -; WIN32-NEXT: LBB39_5: # %overflow.no.lhs -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: testl %ebp, %ebp -; WIN32-NEXT: je LBB39_6 -; WIN32-NEXT: # %bb.4: # %overflow.no.lhs.only -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: imull %ebp, %ebx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %edi, %eax -; WIN32-NEXT: adcl %ebx, %edx -; WIN32-NEXT: jmp LBB39_3 -; WIN32-NEXT: LBB39_2: # %overflow.no.rhs.only -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: imull %ebp, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %ebx, %ebp -; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload -; WIN32-NEXT: adcl %ebp, %edx -; WIN32-NEXT: LBB39_3: # %overflow.res -; WIN32-NEXT: testl %edx, %edx -; WIN32-NEXT: setne %dl -; WIN32-NEXT: jmp LBB39_8 -; WIN32-NEXT: LBB39_6: # %overflow.no -; WIN32-NEXT: imull %ebp, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: imull %edi, %ebx -; WIN32-NEXT: addl %esi, %ebx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: xorl %edx, %edx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: LBB39_8: # %overflow.res -; WIN32-NEXT: movl %esi, (%ecx) -; WIN32-NEXT: movl %eax, 4(%ecx) -; WIN32-NEXT: andb $1, %dl -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: addl $4, %esp +; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: setb %cl +; WIN32-NEXT: orb %ch, %cl +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl %eax, (%esi) +; WIN32-NEXT: movl %edx, 4(%esi) +; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx From cd2815f8739c7e8f3b8817bc88bbbead57725413 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 1 Aug 2025 23:59:00 +0000 Subject: [PATCH 03/12] for the case of overflow.no, multiply the low parts of LHS and RHS Change-Id: Ib0619bde982a8d2a5eba889e12c9412705afebee --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 8 +++-- 
llvm/test/CodeGen/AArch64/i128-math.ll | 29 +++++-------------- .../CodeGen/AArch64/i128_with_overflow.ll | 11 ++----- .../umulo-128-legalisation-lowering.ll | 10 ++----- 4 files changed, 20 insertions(+), 38 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9db998f24482c..df9301c765420 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6459,7 +6459,9 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { //------------------------------------------------------------------------------ // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - auto *Mul = Builder.CreateMul(LHS, RHS, "mul.no.overflow"); + auto *ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + auto *ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); StructType *STy = StructType::get(I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); @@ -6557,7 +6559,9 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { //------------------------------------------------------------------------------ // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - auto *Mul = Builder.CreateMul(LHS, RHS, "mul.no.overflow"); + auto *ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + auto *ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); StructType * STy = StructType::get(I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 5c7aa3e62ec1b..3bd563e9c0839 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -280,11 +280,8 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: b .LBB17_3 ; CHECK-NEXT: .LBB17_2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: .LBB17_3: // %overflow.res ; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: bic w2, w9, w8 @@ -321,11 +318,9 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: and w2, w8, #0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB18_2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: and w2, wzr, #0x1 +; CHECK-NEXT: and w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -357,10 +352,8 @@ define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: csinc w10, w10, wzr, lo ; CHECK-NEXT: b .LBB19_3 ; CHECK-NEXT: .LBB19_2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: umulh x9, x0, x2 ; CHECK-NEXT: mov w10, wzr -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: madd x9, x1, x2, x8 ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: .LBB19_3: // %overflow.res ; CHECK-NEXT: tst w10, #0x1 @@ -396,11 +389,9 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cmp x3, x8 ; CHECK-NEXT: b.ne .LBB21_3 ; CHECK-NEXT: // %bb.2: // %overflow.no -; 
CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: b .LBB21_4 ; CHECK-NEXT: .LBB21_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 @@ -454,10 +445,8 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cmp x3, x8 ; CHECK-NEXT: b.ne .LBB22_3 ; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 ; CHECK-NEXT: and w2, wzr, #0x1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB22_3: // %overflow @@ -509,11 +498,9 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cmp x3, x8 ; CHECK-NEXT: b.ne .LBB23_3 ; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 +; CHECK-NEXT: smulh x8, x0, x2 ; CHECK-NEXT: mov w10, wzr ; CHECK-NEXT: mul x9, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: madd x8, x1, x2, x8 ; CHECK-NEXT: b .LBB23_4 ; CHECK-NEXT: .LBB23_3: // %overflow ; CHECK-NEXT: asr x9, x1, #63 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 34f8b10c24902..b6a06d55537bd 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -243,11 +243,8 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: tbnz w8, #0, .LBB4_3 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 +; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: tbz w8, #0, .LBB4_4 ; CHECK-NEXT: .LBB4_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill @@ -289,11 +286,9 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: cmp x3, x8 ; CHECK-NEXT: b.ne .LBB5_3 ; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 +; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: tbnz w8, #0, .LBB5_4 ; CHECK-NEXT: b .LBB5_5 ; CHECK-NEXT: .LBB5_3: // %overflow diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index 14b1dc7f2d6df..f72b88fbc717c 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -23,11 +23,9 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-NEXT: and w2, w8, #0x1 ; AARCH-NEXT: ret ; AARCH-NEXT: .LBB0_2: // %overflow.no -; AARCH-NEXT: umulh x8, x0, x2 -; AARCH-NEXT: madd x8, x0, x3, x8 +; AARCH-NEXT: umulh x1, x0, x2 ; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: madd x1, x1, x2, x8 -; AARCH-NEXT: and w2, wzr, #0x1 +; AARCH-NEXT: and w2, w8, #0x1 ; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 @@ -54,11 +52,9 @@ define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 ; AARCH-NEXT: cmp x3, x8 ; AARCH-NEXT: b.ne .LBB1_3 ; AARCH-NEXT: // %bb.2: // %overflow.no -; AARCH-NEXT: umulh x8, x0, x2 +; AARCH-NEXT: smulh x8, x0, x2 ; AARCH-NEXT: mov w9, wzr -; AARCH-NEXT: madd x8, x0, x3, x8 ; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: madd x8, x1, x2, x8 ; AARCH-NEXT: tbnz x1, #63, .LBB1_4 ; AARCH-NEXT: b .LBB1_5 ; AARCH-NEXT: .LBB1_3: // %overflow From cd23298dbd44cfdea661f0eeaaeaa7415a040108 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Sat, 2 Aug 2025 00:12:04 +0000 Subject: [PATCH 04/12] rebase Change-Id: I77f0f79e2c1057fa48db53be37e8a74af50c42e8 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 36 ++++++++++--------- .../X86/umulo-128-legalisation-lowering.ll | 6 ++-- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index df9301c765420..e3bd394b710e6 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6417,8 +6417,10 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - // Skip the optimizaiton if the type with HalfBitWidth is not legal for the target. - if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) + // Skip the optimizaiton if the type with HalfBitWidth is not legal for the + // target. 
+ if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != + TargetLowering::TypeLegal) return false; I->getParent()->setName("overflow.res"); @@ -6449,9 +6451,9 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { auto *HiLHS = Builder.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc"); auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, - ConstantInt::getNullValue(LegalTy)); + ConstantInt::getNullValue(LegalTy)); auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, - ConstantInt::getNullValue(LegalTy)); + ConstantInt::getNullValue(LegalTy)); auto *Or = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); Builder.CreateCondBr(Or, OverflowBB, NoOverflowBB); OverflowoEntryBB->getTerminator()->eraseFromParent(); @@ -6462,14 +6464,14 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { auto *ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); auto *ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); - StructType *STy = StructType::get(I->getContext(), - {Ty, IntegerType::getInt1Ty(I->getContext())}); + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); StructValNoOverflow = - Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); + Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); StructValNoOverflow = Builder.CreateInsertValue( StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); - Builder.CreateBr(OverflowResBB); + Builder.CreateBr(OverflowResBB); //------------------------------------------------------------------------------ // BB overflow.res: @@ -6514,8 +6516,10 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - // Skip the optimizaiton if the type with HalfBitWidth is not legal for the target. - if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) + // Skip the optimizaiton if the type with HalfBitWidth is not legal for the + // target. 
+ if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != + TargetLowering::TypeLegal) return false; I->getParent()->setName("overflow.res"); @@ -6540,13 +6544,13 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); auto *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); auto *SignLoRHS = - Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); auto *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); auto *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); auto *SignLoLHS = - Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); auto *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); @@ -6562,14 +6566,14 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { auto *ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); auto *ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); - StructType * STy = StructType::get(I->getContext(), - {Ty, IntegerType::getInt1Ty(I->getContext())}); + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValNoOverflow = PoisonValue::get(STy); StructValNoOverflow = - Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); + Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); StructValNoOverflow = Builder.CreateInsertValue( StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); - Builder.CreateBr(OverflowResBB); + Builder.CreateBr(OverflowResBB); //------------------------------------------------------------------------------ // BB overflow.res: diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 4c3170304b980..89afd1b00444b 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $24, %esp -; X86-NEXT: .cfi_def_cfa_offset 44 +; X86-NEXT: subl $28, %esp +; X86-NEXT: .cfi_def_cfa_offset 48 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 From 7d1df2fb5af7ca9ecbaf2d7226c3ef849a9ee4bf Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Sun, 3 Aug 2025 03:31:25 +0000 Subject: [PATCH 05/12] reduce number of cmp instr. use multiple PHIs of scalar types instead of aggregate type. 
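This reworks the overflow.res merge point: instead of one PHI of the aggregate
{iN, i1} type, the product and the overflow flag flow through two scalar PHIs
and the aggregate is rebuilt with insertvalue; the signed entry check likewise
folds its two per-operand compares into xors combined by a single or and one
compare. A minimal sketch of the intended overflow.res shape, assuming an i128
multiply (value names are illustrative; the block and extract names match the
builder calls in the diff below):

    overflow.res:
      %mul.res = phi i128 [ %mul.no.overflow, %overflow.no ], [ %mul.overflow, %overflow ]
      %ovf.res = phi i1 [ false, %overflow.no ], [ %overflow.flag, %overflow ]
      %agg0 = insertvalue { i128, i1 } poison, i128 %mul.res, 0
      %res = insertvalue { i128, i1 } %agg0, i1 %ovf.res, 1
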
Change-Id: Ie6bc78eda41f454e9edeea7b3bf2c21da1a89693 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 88 ++++++++++------- llvm/test/CodeGen/AArch64/i128-math.ll | 94 +++++++++---------- .../CodeGen/AArch64/i128_with_overflow.ll | 35 ++++--- .../umulo-128-legalisation-lowering.ll | 44 +++++---- 4 files changed, 136 insertions(+), 125 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index e3bd394b710e6..d0cd70801c842 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6417,7 +6417,7 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - // Skip the optimizaiton if the type with HalfBitWidth is not legal for the + // Skip the optimization if the type with HalfBitWidth is not legal for the // target. if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) @@ -6464,32 +6464,40 @@ bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { auto *ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); auto *ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); - StructType *STy = StructType::get( - I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflow = PoisonValue::get(STy); - StructValNoOverflow = - Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); - StructValNoOverflow = Builder.CreateInsertValue( - StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); Builder.CreateBr(OverflowResBB); //------------------------------------------------------------------------------ // BB overflow.res: Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); - auto *PHINode = Builder.CreatePHI(STy, 2); - PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); + auto *PHINode1 = Builder.CreatePHI(Ty, 2); + PHINode1->addIncoming(Mul, NoOverflowBB); + auto *PHINode2 = + Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); + PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB); + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValOverflowRes = PoisonValue::get(STy); + StructValOverflowRes = + Builder.CreateInsertValue(StructValOverflowRes, PHINode1, {0}); + StructValOverflowRes = + Builder.CreateInsertValue(StructValOverflowRes, PHINode2, {1}); // Before moving the mul.overflow intrinsic to the overflowBB, replace all its - // uses by PHINode. - I->replaceAllUsesWith(PHINode); + // uses by StructValOverflowRes. + I->replaceAllUsesWith(StructValOverflowRes); + I->removeFromParent(); // BB overflow: - PHINode->addIncoming(I, OverflowBB); - I->removeFromParent(); I->insertInto(OverflowBB, OverflowBB->end()); Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + auto *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + auto *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); Builder.CreateBr(OverflowResBB); + // Add The Extracted values to the PHINodes in the overflow.res block. + PHINode1->addIncoming(MulOverflow, OverflowBB); + PHINode2->addIncoming(OverflowFlag, OverflowBB); + // return false to stop reprocessing the function. 
return false; } @@ -6516,7 +6524,7 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { unsigned VTHalfBitWidth = VTBitWidth / 2; auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - // Skip the optimizaiton if the type with HalfBitWidth is not legal for the + // Skip the optimization if the type with HalfBitWidth is not legal for the // target. if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != TargetLowering::TypeLegal) @@ -6553,11 +6561,17 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); auto *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); - - auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); - auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); - auto *Or = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); - Builder.CreateCondBr(Or, OverflowBB, NoOverflowBB); + // xor(HiLHS, SignLoLHS) false -> no overflow + // xor(HiRHS, SignLoRHS) false -> no overflow + // if either of the above is true, then overflow. + // auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); + auto *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); + auto *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); + // auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); + auto *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); + auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, Or, + ConstantInt::get(Or->getType(), 1)); + Builder.CreateCondBr(Cmp, OverflowBB, NoOverflowBB); OverflowoEntryBB->getTerminator()->eraseFromParent(); //------------------------------------------------------------------------------ @@ -6566,32 +6580,40 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { auto *ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); auto *ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); - StructType *STy = StructType::get( - I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValNoOverflow = PoisonValue::get(STy); - StructValNoOverflow = - Builder.CreateInsertValue(StructValNoOverflow, Mul, {0}); - StructValNoOverflow = Builder.CreateInsertValue( - StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); Builder.CreateBr(OverflowResBB); //------------------------------------------------------------------------------ // BB overflow.res: Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); - auto *PHINode = Builder.CreatePHI(STy, 2); - PHINode->addIncoming(StructValNoOverflow, NoOverflowBB); + auto *PHINode1 = Builder.CreatePHI(Ty, 2); + PHINode1->addIncoming(Mul, NoOverflowBB); + auto *PHINode2 = + Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); + PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB); + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + Value *StructValOverflowRes = PoisonValue::get(STy); + StructValOverflowRes = + Builder.CreateInsertValue(StructValOverflowRes, PHINode1, {0}); + StructValOverflowRes = + Builder.CreateInsertValue(StructValOverflowRes, PHINode2, {1}); // Before moving the mul.overflow intrinsic to the overflowBB, replace all its - // uses by PHINode. - I->replaceAllUsesWith(PHINode); + // uses by StructValOverflowRes. 
+ I->replaceAllUsesWith(StructValOverflowRes); + I->removeFromParent(); // BB overflow: - PHINode->addIncoming(I, OverflowBB); - I->removeFromParent(); I->insertInto(OverflowBB, OverflowBB->end()); Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + auto *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + auto *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); Builder.CreateBr(OverflowResBB); + // Add The Extracted values to the PHINodes in the overflow.res block. + PHINode1->addIncoming(MulOverflow, OverflowBB); + PHINode2->addIncoming(OverflowFlag, OverflowBB); + // return false to stop reprocessing the function. return false; } diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 3bd563e9c0839..d7e71dc51dcb5 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -278,13 +278,12 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: b .LBB17_3 +; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB17_2: // %overflow.no ; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: .LBB17_3: // %overflow.res -; CHECK-NEXT: mov w9, #1 // =0x1 -; CHECK-NEXT: bic w2, w9, w8 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -314,13 +313,12 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 -; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: csinc w2, w8, wzr, lo ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB18_2: // %overflow.no ; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -356,7 +354,7 @@ define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: mov w10, wzr ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: .LBB19_3: // %overflow.res -; CHECK-NEXT: tst w10, #0x1 +; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq ; CHECK-NEXT: ret @@ -382,18 +380,12 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.ne .LBB21_3 -; CHECK-NEXT: // %bb.1: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.ne .LBB21_3 -; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: smulh x1, x0, x2 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: b .LBB21_4 -; CHECK-NEXT: .LBB21_3: // %overflow +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: b.ne .LBB21_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -421,9 +413,12 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cmp x8, x10 ; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: .LBB21_4: // %overflow.res -; CHECK-NEXT: mov w9, #1 // =0x1 -; CHECK-NEXT: 
bic w2, w9, w8 +; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB21_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, wzr, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -438,18 +433,12 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.ne .LBB22_3 -; CHECK-NEXT: // %bb.1: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.ne .LBB22_3 -; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: smulh x1, x0, x2 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: and w2, wzr, #0x1 -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB22_3: // %overflow +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: b.ne .LBB22_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -476,8 +465,12 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adc x9, x9, x11 ; CHECK-NEXT: cmp x8, x10 ; CHECK-NEXT: ccmp x9, x10, #0, eq -; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: and w2, w8, #0x1 +; CHECK-NEXT: cset w2, ne +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -491,18 +484,12 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.ne .LBB23_3 -; CHECK-NEXT: // %bb.1: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.ne .LBB23_3 -; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: smulh x8, x0, x2 -; CHECK-NEXT: mov w10, wzr -; CHECK-NEXT: mul x9, x0, x2 -; CHECK-NEXT: b .LBB23_4 -; CHECK-NEXT: .LBB23_3: // %overflow +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: b.ne .LBB23_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -529,9 +516,14 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: cmp x11, x14 ; CHECK-NEXT: ccmp x10, x14, #0, eq ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: .LBB23_4: // %overflow.res +; CHECK-NEXT: b .LBB23_3 +; CHECK-NEXT: .LBB23_2: // %overflow.no +; CHECK-NEXT: smulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: .LBB23_3: // %overflow.res ; CHECK-NEXT: eor x11, x3, x1 -; CHECK-NEXT: tst w10, #0x1 +; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: asr x11, x11, #63 ; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff ; CHECK-NEXT: csinv x0, x9, x11, eq diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index b6a06d55537bd..75e76472905c2 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -240,12 +240,12 @@ 
define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: tbnz w8, #0, .LBB4_3 +; CHECK-NEXT: cbnz w8, .LBB4_3 ; CHECK-NEXT: b .LBB4_4 ; CHECK-NEXT: .LBB4_2: // %overflow.no ; CHECK-NEXT: umulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEXT: cbz w8, .LBB4_4 ; CHECK-NEXT: .LBB4_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 @@ -279,19 +279,12 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: ; CHECK: // %bb.0: // %overflow.entry -; CHECK-NEXT: cmp x1, x0, asr #63 -; CHECK-NEXT: b.ne .LBB5_3 -; CHECK-NEXT: // %bb.1: // %overflow.entry -; CHECK-NEXT: asr x8, x2, #63 -; CHECK-NEXT: cmp x3, x8 -; CHECK-NEXT: b.ne .LBB5_3 -; CHECK-NEXT: // %bb.2: // %overflow.no -; CHECK-NEXT: smulh x1, x0, x2 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: tbnz w8, #0, .LBB5_4 -; CHECK-NEXT: b .LBB5_5 -; CHECK-NEXT: .LBB5_3: // %overflow +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: b.ne .LBB5_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -319,8 +312,14 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: cmp x8, x10 ; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: tbz w8, #0, .LBB5_5 -; CHECK-NEXT: .LBB5_4: // %if.then +; CHECK-NEXT: cbnz w8, .LBB5_3 +; CHECK-NEXT: b .LBB5_4 +; CHECK-NEXT: .LBB5_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: cbz w8, .LBB5_4 +; CHECK-NEXT: .LBB5_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -329,7 +328,7 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .LBB5_5: // %cleanup +; CHECK-NEXT: .LBB5_4: // %cleanup ; CHECK-NEXT: ret entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index f72b88fbc717c..f98438593262f 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -19,13 +19,12 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: cset w8, ne ; AARCH-NEXT: adds x1, x11, x9 -; AARCH-NEXT: csinc w8, w8, wzr, lo -; AARCH-NEXT: and w2, w8, #0x1 +; AARCH-NEXT: csinc w2, w8, wzr, lo ; AARCH-NEXT: ret ; AARCH-NEXT: .LBB0_2: // %overflow.no ; AARCH-NEXT: umulh x1, x0, x2 ; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: and w2, w8, #0x1 +; AARCH-NEXT: mov w2, wzr ; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 @@ -44,20 +43,13 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: ; AARCH: // %bb.0: // %overflow.entry -; AARCH-NEXT: cmp x1, x0, asr #63 +; AARCH-NEXT: eor x8, x3, x2, asr #63 +; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: b.ne .LBB1_3 -; AARCH-NEXT: // %bb.1: // %overflow.entry -; AARCH-NEXT: asr x8, x2, #63 -; AARCH-NEXT: cmp x3, x8 -; AARCH-NEXT: b.ne .LBB1_3 -; AARCH-NEXT: // %bb.2: // %overflow.no -; AARCH-NEXT: smulh x8, x0, x2 -; AARCH-NEXT: mov w9, wzr -; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: tbnz x1, #63, .LBB1_4 -; AARCH-NEXT: b .LBB1_5 -; AARCH-NEXT: .LBB1_3: // %overflow +; AARCH-NEXT: orr x8, x9, x8 +; AARCH-NEXT: cmp x8, #1 +; AARCH-NEXT: b.ne .LBB1_2 +; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: asr x9, x1, #63 ; AARCH-NEXT: umulh x10, x0, x2 ; AARCH-NEXT: asr x13, x3, #63 @@ -84,17 +76,23 @@ define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 ; AARCH-NEXT: cmp x9, x11 ; AARCH-NEXT: ccmp x10, x11, #0, eq ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbz x1, #63, .LBB1_5 -; AARCH-NEXT: .LBB1_4: // %overflow.res +; AARCH-NEXT: tbnz x1, #63, .LBB1_3 +; AARCH-NEXT: b .LBB1_4 +; AARCH-NEXT: .LBB1_2: // %overflow.no +; AARCH-NEXT: smulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: tbz x1, #63, .LBB1_4 +; AARCH-NEXT: .LBB1_3: // %overflow.res ; AARCH-NEXT: eor x10, x3, #0x8000000000000000 ; AARCH-NEXT: orr x10, x2, x10 -; AARCH-NEXT: cbz x10, .LBB1_6 -; AARCH-NEXT: .LBB1_5: // %Else2 -; AARCH-NEXT: tbz w9, #0, .LBB1_7 -; AARCH-NEXT: .LBB1_6: // %Then7 +; AARCH-NEXT: cbz x10, .LBB1_5 +; AARCH-NEXT: .LBB1_4: // %Else2 +; AARCH-NEXT: cbz w9, .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %Then7 ; AARCH-NEXT: mov w9, #1 // =0x1 ; AARCH-NEXT: str w9, [x4] -; AARCH-NEXT: .LBB1_7: // %Block9 +; AARCH-NEXT: .LBB1_6: // %Block9 ; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: From 95311334c6051d3a6f8d1030bda91e4c37fdffb9 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 11 Aug 2025 03:12:47 +0000 Subject: [PATCH 06/12] resolve review comments --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 217 
+++++++++------------------- 1 file changed, 70 insertions(+), 147 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index d0cd70801c842..a3e2c1ba04e56 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -338,6 +338,10 @@ class CodeGenPrepare { /// Keep track of instructions removed during promotion. SetOfInstrs RemovedInsts; + /// Keep track of seen mul_with_overflow intrinsics to avoid + // reprocessing them. + DenseMap SeenMulWithOverflowInstrs; + /// Keep track of sext chains based on their initial value. DenseMap SeenChainsForSExt; @@ -433,6 +437,8 @@ class CodeGenPrepare { bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); bool optimizeUMulWithOverflow(Instruction *I); bool optimizeSMulWithOverflow(Instruction *I); + bool optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT); bool optimizeExt(Instruction *&I); @@ -774,6 +780,7 @@ bool CodeGenPrepare::_run(Function &F) { verifyBFIUpdates(F); #endif + SeenMulWithOverflowInstrs.clear(); return EverMadeChange; } @@ -2781,9 +2788,9 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { } return false; case Intrinsic::umul_with_overflow: - return optimizeUMulWithOverflow(II); + return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT); case Intrinsic::smul_with_overflow: - return optimizeSMulWithOverflow(II); + return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT); } SmallVector PtrOps; @@ -6395,122 +6402,20 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, return true; } -// Rewrite the umul_with_overflow intrinsic by checking if both of the +// Rewrite the mul_with_overflow intrinsic by checking if both of the // operands' value range is within the legal type. If so, we can optimize the // multiplication algorithm. This code is supposed to be written during the step // of type legalization, but given that we need to reconstruct the IR which is // not doable there, we do it here. -bool CodeGenPrepare::optimizeUMulWithOverflow(Instruction *I) { +bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT) { // Enable this optimization only for aarch64. if (!TLI->getTargetMachine().getTargetTriple().isAArch64()) return false; - if (TLI->getTypeAction( - I->getContext(), - TLI->getValueType(*DL, I->getType()->getContainedType(0))) != - TargetLowering::TypeExpandInteger) - return false; - - Value *LHS = I->getOperand(0); - Value *RHS = I->getOperand(1); - auto *Ty = LHS->getType(); - unsigned VTBitWidth = Ty->getScalarSizeInBits(); - unsigned VTHalfBitWidth = VTBitWidth / 2; - auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); - - // Skip the optimization if the type with HalfBitWidth is not legal for the - // target. - if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != - TargetLowering::TypeLegal) + // If we have already seen this instruction, don't process it again. 
+ if (!SeenMulWithOverflowInstrs.insert(std::make_pair(I, true)).second) return false; - I->getParent()->setName("overflow.res"); - auto *OverflowResBB = I->getParent(); - auto *OverflowoEntryBB = - I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); - BasicBlock *NoOverflowBB = BasicBlock::Create( - I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); - BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", - I->getFunction(), OverflowResBB); - // new blocks should be: - // entry: - // (lhs_lo ne lhs_hi) || (rhs_lo ne rhs_hi) ? overflow, overflow_no - - // overflow_no: - // overflow: - // overflow.res: - //------------------------------------------------------------------------------ - // BB overflow.entry: - // get Lo and Hi of RHS & LHS: - IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); - auto *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs.trunc"); - auto *ShrHiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); - auto *HiRHS = Builder.CreateTrunc(ShrHiRHS, LegalTy, "hi.rhs.trunc"); - - auto *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs.trunc"); - auto *ShrHiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); - auto *HiLHS = Builder.CreateTrunc(ShrHiLHS, LegalTy, "hi.lhs.trunc"); - - auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, - ConstantInt::getNullValue(LegalTy)); - auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, - ConstantInt::getNullValue(LegalTy)); - auto *Or = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); - Builder.CreateCondBr(Or, OverflowBB, NoOverflowBB); - OverflowoEntryBB->getTerminator()->eraseFromParent(); - - //------------------------------------------------------------------------------ - // BB overflow.no: - Builder.SetInsertPoint(NoOverflowBB); - auto *ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); - auto *ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); - auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); - Builder.CreateBr(OverflowResBB); - - //------------------------------------------------------------------------------ - // BB overflow.res: - Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); - auto *PHINode1 = Builder.CreatePHI(Ty, 2); - PHINode1->addIncoming(Mul, NoOverflowBB); - auto *PHINode2 = - Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); - PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB); - - StructType *STy = StructType::get( - I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); - Value *StructValOverflowRes = PoisonValue::get(STy); - StructValOverflowRes = - Builder.CreateInsertValue(StructValOverflowRes, PHINode1, {0}); - StructValOverflowRes = - Builder.CreateInsertValue(StructValOverflowRes, PHINode2, {1}); - // Before moving the mul.overflow intrinsic to the overflowBB, replace all its - // uses by StructValOverflowRes. - I->replaceAllUsesWith(StructValOverflowRes); - I->removeFromParent(); - - // BB overflow: - I->insertInto(OverflowBB, OverflowBB->end()); - Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); - auto *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); - auto *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); - Builder.CreateBr(OverflowResBB); - - // Add The Extracted values to the PHINodes in the overflow.res block. - PHINode1->addIncoming(MulOverflow, OverflowBB); - PHINode2->addIncoming(OverflowFlag, OverflowBB); - - // return false to stop reprocessing the function. 
- return false; -} - -// Rewrite the smul_with_overflow intrinsic by checking if both of the -// operands' value range is within the legal type. If so, we can optimize the -// multiplication algorithm. This code is supposed to be written during the step -// of type legalization, but given that we need to reconstruct the IR which is -// not doable there, we do it here. -bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { - // Enable this optimization only for aarch64. - if (!TLI->getTargetMachine().getTargetTriple().isAArch64()) - return false; if (TLI->getTypeAction( I->getContext(), TLI->getValueType(*DL, I->getType()->getContainedType(0))) != @@ -6519,10 +6424,11 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { Value *LHS = I->getOperand(0); Value *RHS = I->getOperand(1); - auto *Ty = LHS->getType(); + Type *Ty = LHS->getType(); unsigned VTBitWidth = Ty->getScalarSizeInBits(); unsigned VTHalfBitWidth = VTBitWidth / 2; - auto *LegalTy = IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); + IntegerType *LegalTy = + IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); // Skip the optimization if the type with HalfBitWidth is not legal for the // target. @@ -6530,9 +6436,13 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { TargetLowering::TypeLegal) return false; + // Make sure that the I->getType() is a struct type with two elements. + if (!I->getType()->isStructTy() || I->getType()->getStructNumElements() != 2) + return false; + I->getParent()->setName("overflow.res"); - auto *OverflowResBB = I->getParent(); - auto *OverflowoEntryBB = + BasicBlock *OverflowResBB = I->getParent(); + BasicBlock *OverflowoEntryBB = I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); BasicBlock *NoOverflowBB = BasicBlock::Create( I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); @@ -6540,54 +6450,67 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { I->getFunction(), OverflowResBB); // new blocks should be: // entry: - // (lhs_lo ne lhs_hi) || (rhs_lo ne rhs_hi) ? overflow, overflow_no + // if signed: + // (lhs_lo ^ lhs_hi) || (rhs_lo ^ rhs_hi) ? overflow, overflow_no + // else: + // (lhs_hi != 0) || (rhs_hi != 0) ? 
overflow, overflow_no // overflow_no: // overflow: // overflow.res: - //------------------------------------------------------------------------------ + // ---------------------------- // BB overflow.entry: - // get Lo and Hi of RHS & LHS: + // get Lo and Hi of LHS & RHS: IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); - auto *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); - auto *SignLoRHS = - Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); - auto *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); + Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); + Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); - auto *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); - auto *SignLoLHS = - Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); - auto *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); - HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); - // xor(HiLHS, SignLoLHS) false -> no overflow - // xor(HiRHS, SignLoRHS) false -> no overflow - // if either of the above is true, then overflow. - // auto *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, SignLoLHS); - auto *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); - auto *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); - // auto *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, SignLoRHS); - auto *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); - auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_EQ, Or, - ConstantInt::get(Or->getType(), 1)); - Builder.CreateCondBr(Cmp, OverflowBB, NoOverflowBB); + Value *IsAnyBitTrue; + if (IsSigned) { + Value *SignLoLHS = + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + Value *SignLoRHS = + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); + Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); + Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); + IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_EQ, Or, + ConstantInt::get(Or->getType(), 1)); + } else { + Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + ConstantInt::getNullValue(LegalTy)); + Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + } + + Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB); OverflowoEntryBB->getTerminator()->eraseFromParent(); - //------------------------------------------------------------------------------ // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - auto *ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); - auto *ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); - auto *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); + Value *ExtLoLHS, *ExtLoRHS; + if (IsSigned) { + ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + } else { + ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + } + + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); Builder.CreateBr(OverflowResBB); - //------------------------------------------------------------------------------ // BB overflow.res: 
Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); - auto *PHINode1 = Builder.CreatePHI(Ty, 2); + PHINode *PHINode1 = Builder.CreatePHI(Ty, 2); PHINode1->addIncoming(Mul, NoOverflowBB); - auto *PHINode2 = + PHINode *PHINode2 = Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB); @@ -6606,16 +6529,16 @@ bool CodeGenPrepare::optimizeSMulWithOverflow(Instruction *I) { // BB overflow: I->insertInto(OverflowBB, OverflowBB->end()); Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); - auto *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); - auto *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); + Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); Builder.CreateBr(OverflowResBB); // Add The Extracted values to the PHINodes in the overflow.res block. PHINode1->addIncoming(MulOverflow, OverflowBB); PHINode2->addIncoming(OverflowFlag, OverflowBB); - // return false to stop reprocessing the function. - return false; + ModifiedDT = ModifyDT::ModifyBBDT; + return true; } /// If there are any memory operands, use OptimizeMemoryInst to sink their From 6ecfd1facd1ba3e906e5cdcf088318515ebb688c Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 1 Sep 2025 13:36:49 +0000 Subject: [PATCH 07/12] Resolve review comments: For the simple case where IR just checks the overflow, skip the check when we're sure that there is no overflow. --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + llvm/lib/CodeGen/CodeGenPrepare.cpp | 142 ++++++++++++++---- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 + llvm/test/CodeGen/AArch64/i128-math.ll | 11 +- .../CodeGen/AArch64/i128_with_overflow.ll | 50 +++--- .../umulo-128-legalisation-lowering.ll | 3 +- 6 files changed, 146 insertions(+), 68 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 2ba8b29e775e0..3ba09e3685b40 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3470,6 +3470,10 @@ class LLVM_ABI TargetLoweringBase { return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); } + // Return true if the target wants to optimize the mul overflow intrinsic + // by detecting if there is no overflow. + virtual bool shouldOptimizeMulOverflowIntrinsic() const { return false; } + // Return true if it is profitable to use a scalar input to a BUILD_VECTOR // even if the vector itself has multiple uses. virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index a3e2c1ba04e56..0b5a00f6c2bee 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6409,8 +6409,7 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, // not doable there, we do it here. bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, ModifyDT &ModifiedDT) { - // Enable this optimization only for aarch64. - if (!TLI->getTargetMachine().getTargetTriple().isAArch64()) + if (!TLI->shouldOptimizeMulOverflowIntrinsic()) return false; // If we have already seen this instruction, don't process it again. 
if (!SeenMulWithOverflowInstrs.insert(std::make_pair(I, true)).second) @@ -6440,29 +6439,42 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, if (!I->getType()->isStructTy() || I->getType()->getStructNumElements() != 2) return false; - I->getParent()->setName("overflow.res"); - BasicBlock *OverflowResBB = I->getParent(); - BasicBlock *OverflowoEntryBB = - I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); - BasicBlock *NoOverflowBB = BasicBlock::Create( - I->getContext(), "overflow.no", I->getFunction(), OverflowResBB); - BasicBlock *OverflowBB = BasicBlock::Create(I->getContext(), "overflow", - I->getFunction(), OverflowResBB); - // new blocks should be: + // ---------------------------- + + // For the simple case where IR just checks the overflow flag, new blocks + // should be: // entry: // if signed: // (lhs_lo ^ lhs_hi) || (rhs_lo ^ rhs_hi) ? overflow, overflow_no // else: // (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no + // overflow_no: + // overflow: + // otherwise, new blocks should be: + // entry: + // if signed: + // (lhs_lo ^ lhs_hi) || (rhs_lo ^ rhs_hi) ? overflow, overflow_no + // else: + // (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no // overflow_no: // overflow: // overflow.res: - // ---------------------------- + // New BBs: + BasicBlock *OverflowoEntryBB = + I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); + // Remove the 'br' instruction that is generated as a result of the split: + OverflowoEntryBB->getTerminator()->eraseFromParent(); + BasicBlock *NoOverflowBB = + BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction()); + NoOverflowBB->moveAfter(OverflowoEntryBB); + I->getParent()->setName("overflow"); + BasicBlock *OverflowBB = I->getParent(); + // BB overflow.entry: - // get Lo and Hi of LHS & RHS: - IRBuilder<> Builder(OverflowoEntryBB->getTerminator()); + // Get Lo and Hi of LHS & RHS: + IRBuilder<> Builder(OverflowoEntryBB); Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); @@ -6479,8 +6491,8 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); - IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_EQ, Or, - ConstantInt::get(Or->getType(), 1)); + IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or, + ConstantInt::getNullValue(Or->getType())); } else { Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, ConstantInt::getNullValue(LegalTy)); @@ -6488,9 +6500,7 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, ConstantInt::getNullValue(LegalTy)); IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); } - Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB); - OverflowoEntryBB->getTerminator()->eraseFromParent(); // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); @@ -6503,24 +6513,95 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); } - Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.no.overflow"); + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); + + // In overflow.no BB: we are sure that the overflow flag is false. 
+ // So, if we found this pattern: + // br (extractvalue (%mul, 1)), label %if.then, label %if.end + // then we can jump directly to %if.end as we're sure that there is no + // overflow. + BasicBlock *DetectNoOverflowBrBB = nullptr; + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + // Look for the pattern in the users of I, and make sure that all the users + // are either part of the pattern or NOT in the same BB as I. + for (User *U : I->users()) { + if (auto *Instr = dyn_cast(U); + Instr && Instr->getParent() != I->getParent()) + continue; + + if (auto *ExtUser = dyn_cast(U)) { + if (ExtUser->hasOneUse() && ExtUser->getNumIndices() == 1 && + ExtUser->getIndices()[0] == 1) { + if (auto *Br = dyn_cast(*ExtUser->user_begin())) { + DetectNoOverflowBrBB = Br->getSuccessor(1) /*if.end*/; + continue; + } + } + } + // If we come here, it means that either the pattern doesn't exist or + // there are multiple users in the same BB + DetectNoOverflowBrBB = nullptr; + break; + } + if (DetectNoOverflowBrBB) { + // BB overflow.no: jump directly to if.end BB + Builder.CreateBr(DetectNoOverflowBrBB); + // BB if.end: + Builder.SetInsertPoint(DetectNoOverflowBrBB, + DetectNoOverflowBrBB->getFirstInsertionPt()); + // Create PHI node to get the results of multiplication from 'overflow.no' + // and 'overflow' BBs + PHINode *NoOverflowPHI = Builder.CreatePHI(Ty, 2); + NoOverflowPHI->addIncoming(Mul, NoOverflowBB); + // Create struct value to replace all uses of I + Value *StructValNoOverflow = PoisonValue::get(STy); + StructValNoOverflow = + Builder.CreateInsertValue(StructValNoOverflow, NoOverflowPHI, {0}); + // Overflow flag is always false as we are sure it's not overflow. + StructValNoOverflow = Builder.CreateInsertValue( + StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); + // Replace all uses of I, only uses dominated by the if.end BB + I->replaceUsesOutsideBlock(StructValNoOverflow, I->getParent()); + // BB overflow: + Builder.SetInsertPoint(OverflowBB, + I->getParent()->getTerminator()->getIterator()); + // Extract the multiplication result to add it to the PHI node in the if.end + // BB + Value *IntrinsicMulRes = Builder.CreateExtractValue(I, {0}, "mul.extract"); + NoOverflowPHI->addIncoming(IntrinsicMulRes, OverflowBB); + ModifiedDT = ModifyDT::ModifyBBDT; + return true; + } + + // Otherwise, we need to create the 'overflow.res' BB to merge the results of + // the two paths. + I->getParent()->setName("overflow.res"); + BasicBlock *OverflowResBB = I->getParent(); + OverflowBB = BasicBlock::Create(I->getContext(), "overflow", I->getFunction(), + OverflowResBB); + // Initially I->getParent() was the overflow BB, now it becomes the + // overflow.res BB. So we need to keep the old reference to the overflow BB. 
+ OverflowResBB->replaceAllUsesWith(OverflowBB); + + // BB overflow.no: jump to overflow.res BB Builder.CreateBr(OverflowResBB); // BB overflow.res: Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); - PHINode *PHINode1 = Builder.CreatePHI(Ty, 2); - PHINode1->addIncoming(Mul, NoOverflowBB); - PHINode *PHINode2 = - Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); - PHINode2->addIncoming(ConstantInt::getFalse(I->getContext()), NoOverflowBB); + PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2), + *OverflowFlagPHI = + Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); - StructType *STy = StructType::get( - I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); Value *StructValOverflowRes = PoisonValue::get(STy); StructValOverflowRes = - Builder.CreateInsertValue(StructValOverflowRes, PHINode1, {0}); + Builder.CreateInsertValue(StructValOverflowRes, OverflowResPHI, {0}); StructValOverflowRes = - Builder.CreateInsertValue(StructValOverflowRes, PHINode2, {1}); + Builder.CreateInsertValue(StructValOverflowRes, OverflowFlagPHI, {1}); + OverflowResPHI->addIncoming(Mul, NoOverflowBB); + OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()), + NoOverflowBB); + // Before moving the mul.overflow intrinsic to the overflowBB, replace all its // uses by StructValOverflowRes. I->replaceAllUsesWith(StructValOverflowRes); @@ -6534,9 +6615,8 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, Builder.CreateBr(OverflowResBB); // Add The Extracted values to the PHINodes in the overflow.res block. - PHINode1->addIncoming(MulOverflow, OverflowBB); - PHINode2->addIncoming(OverflowFlag, OverflowBB); - + OverflowResPHI->addIncoming(MulOverflow, OverflowBB); + OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB); ModifiedDT = ModifyDT::ModifyBBDT; return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f5d14905cac66..36938a493e923 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -321,6 +321,10 @@ class AArch64TargetLowering : public TargetLowering { return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } + // Return true if the target wants to optimize the mul overflow intrinsic + // by detecting if there is no overflow. 
+ bool shouldOptimizeMulOverflowIntrinsic() const override { return true; } + Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index d7e71dc51dcb5..df526b76e7356 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -383,8 +383,7 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: b.ne .LBB21_2 +; CHECK-NEXT: cbz x8, .LBB21_2 ; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 @@ -418,7 +417,7 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: .LBB21_2: // %overflow.no ; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: eor w2, wzr, #0x1 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -436,8 +435,7 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: b.ne .LBB22_2 +; CHECK-NEXT: cbz x8, .LBB22_2 ; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 @@ -487,8 +485,7 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: b.ne .LBB23_2 +; CHECK-NEXT: cbz x8, .LBB23_2 ; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 75e76472905c2..8cc2fcc362882 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -225,7 +225,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: ; CHECK: // %bb.0: // %overflow.entry ; CHECK-NEXT: orr x8, x1, x3 -; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: cbz x8, .LBB4_3 ; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 @@ -236,17 +236,12 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: ccmp xzr, x10, #0, eq ; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: cbnz w8, .LBB4_3 -; CHECK-NEXT: b .LBB4_4 -; CHECK-NEXT: .LBB4_2: // %overflow.no -; CHECK-NEXT: umulh x1, x0, x2 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: cbz w8, .LBB4_4 -; CHECK-NEXT: .LBB4_3: // %if.then +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: b.ne .LBB4_4 +; CHECK-NEXT: // %bb.2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -255,7 +250,11 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .LBB4_4: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_3: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: .LBB4_4: +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: ret entry: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) @@ -282,8 +281,7 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: b.ne .LBB5_2 +; CHECK-NEXT: cbz x8, .LBB5_3 ; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 @@ -304,22 +302,14 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: adc x10, x12, x13 ; CHECK-NEXT: asr x12, x10, #63 ; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adc x10, x11, x12 ; CHECK-NEXT: adds x8, x15, x8 -; CHECK-NEXT: adc x9, x9, x11 -; CHECK-NEXT: cmp x8, x10 -; CHECK-NEXT: ccmp x9, x10, #0, eq -; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: cbnz w8, .LBB5_3 -; CHECK-NEXT: b .LBB5_4 -; CHECK-NEXT: .LBB5_2: // %overflow.no -; CHECK-NEXT: smulh x1, x0, x2 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: cbz w8, .LBB5_4 -; CHECK-NEXT: .LBB5_3: // %if.then +; CHECK-NEXT: asr x11, x1, #63 +; CHECK-NEXT: adc x9, x9, x10 +; CHECK-NEXT: cmp x9, x11 +; CHECK-NEXT: ccmp x8, x11, #0, eq +; CHECK-NEXT: b.eq .LBB5_4 +; CHECK-NEXT: // %bb.2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -328,7 +318,11 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .LBB5_4: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB5_3: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: .LBB5_4: +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: ret entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index f98438593262f..0c06bd4ab277f 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -47,8 +47,7 @@ define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 ; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] ; AARCH-NEXT: orr x8, x9, x8 -; AARCH-NEXT: cmp x8, #1 -; AARCH-NEXT: b.ne .LBB1_2 +; AARCH-NEXT: cbz x8, .LBB1_2 ; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: asr x9, x1, #63 ; AARCH-NEXT: umulh x10, x0, x2 From d8787246deab2b8f4902c4efdbf99d9da64829c4 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Fri, 5 Sep 2025 16:12:56 +0000 Subject: [PATCH 08/12] fix issue related to Blocks Change-Id: I4afe203a6cedb0134812143e7211ca9e80ce6687 --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 52 ++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 0b5a00f6c2bee..69d12aeb77d15 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -435,8 +435,6 @@ class CodeGenPrepare { bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); - bool optimizeUMulWithOverflow(Instruction *I); - bool optimizeSMulWithOverflow(Instruction *I); bool optimizeMulWithOverflow(Instruction *I, bool IsSigned, ModifyDT &ModifiedDT); bool optimizeInlineAsmInst(CallInst *CS); @@ -6462,19 +6460,21 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, // overflow.res: // New BBs: - BasicBlock *OverflowoEntryBB = + std::string KeepBBName = I->getParent()->getName().str(); + BasicBlock *OverflowEntryBB = I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); // Remove the 'br' instruction that is generated as a result of the split: - OverflowoEntryBB->getTerminator()->eraseFromParent(); + OverflowEntryBB->getTerminator()->eraseFromParent(); BasicBlock *NoOverflowBB = BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction()); - NoOverflowBB->moveAfter(OverflowoEntryBB); - I->getParent()->setName("overflow"); - BasicBlock *OverflowBB = I->getParent(); + NoOverflowBB->moveAfter(OverflowEntryBB); + BasicBlock *OverflowBB = + BasicBlock::Create(I->getContext(), "overflow", I->getFunction()); + OverflowBB->moveAfter(NoOverflowBB); // BB overflow.entry: + IRBuilder<> Builder(OverflowEntryBB); // Get Lo and Hi of LHS & RHS: - IRBuilder<> Builder(OverflowoEntryBB); Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); @@ -6504,16 +6504,7 @@ bool 
CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - Value *ExtLoLHS, *ExtLoRHS; - if (IsSigned) { - ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); - ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); - } else { - ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); - ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); - } - - Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); + Value *Mul = Builder.CreateMul(LHS, RHS, "mul.overflow.no"); // In overflow.no BB: we are sure that the overflow flag is false. // So, if we found this pattern: @@ -6547,6 +6538,7 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, if (DetectNoOverflowBrBB) { // BB overflow.no: jump directly to if.end BB Builder.CreateBr(DetectNoOverflowBrBB); + // BB if.end: Builder.SetInsertPoint(DetectNoOverflowBrBB, DetectNoOverflowBrBB->getFirstInsertionPt()); @@ -6563,26 +6555,30 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1}); // Replace all uses of I, only uses dominated by the if.end BB I->replaceUsesOutsideBlock(StructValNoOverflow, I->getParent()); + + // Remove the original BB as it's divided into 'overflow.entry' and + // 'overflow' BBs. + BasicBlock *ToBeRemoveBB = I->getParent(); // BB overflow: - Builder.SetInsertPoint(OverflowBB, - I->getParent()->getTerminator()->getIterator()); + OverflowBB->splice(OverflowBB->end(), ToBeRemoveBB); // Extract the multiplication result to add it to the PHI node in the if.end // BB + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); Value *IntrinsicMulRes = Builder.CreateExtractValue(I, {0}, "mul.extract"); + cast(IntrinsicMulRes)->moveAfter(I); NoOverflowPHI->addIncoming(IntrinsicMulRes, OverflowBB); + + ToBeRemoveBB->eraseFromParent(); + // Restore the original name of the overflow.entry BB: + OverflowEntryBB->setName(KeepBBName); ModifiedDT = ModifyDT::ModifyBBDT; return true; } // Otherwise, we need to create the 'overflow.res' BB to merge the results of - // the two paths. + // the two paths: I->getParent()->setName("overflow.res"); BasicBlock *OverflowResBB = I->getParent(); - OverflowBB = BasicBlock::Create(I->getContext(), "overflow", I->getFunction(), - OverflowResBB); - // Initially I->getParent() was the overflow BB, now it becomes the - // overflow.res BB. So we need to keep the old reference to the overflow BB. - OverflowResBB->replaceAllUsesWith(OverflowBB); // BB overflow.no: jump to overflow.res BB Builder.CreateBr(OverflowResBB); @@ -6617,6 +6613,10 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, // Add The Extracted values to the PHINodes in the overflow.res block. 
OverflowResPHI->addIncoming(MulOverflow, OverflowBB); OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB); + + // Restore the original name of the overflow.entry BB: + OverflowEntryBB->setName(KeepBBName); + ModifiedDT = ModifyDT::ModifyBBDT; return true; } From 0610edf45df5933dd0872bb6b1131babd276d4e9 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 9 Sep 2025 11:45:35 +0100 Subject: [PATCH 09/12] update test cases --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 11 ++++++++++- llvm/test/CodeGen/AArch64/i128-math.ll | 12 ++++++------ llvm/test/CodeGen/AArch64/i128_with_overflow.ll | 4 ++-- .../AArch64/umulo-128-legalisation-lowering.ll | 4 ++-- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 69d12aeb77d15..f85976c4961ac 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6504,7 +6504,16 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - Value *Mul = Builder.CreateMul(LHS, RHS, "mul.overflow.no"); + Value *ExtLoLHS, *ExtLoRHS; + if (IsSigned) { + ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + } else { + ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + } + + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); // In overflow.no BB: we are sure that the overflow flag is false. // So, if we found this pattern: diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index df526b76e7356..12ae241dda4bd 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -261,7 +261,7 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: orr x8, x1, x3 ; CHECK-NEXT: cbz x8, .LBB17_2 ; CHECK-NEXT: // %bb.1: // %overflow @@ -297,7 +297,7 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: orr x8, x1, x3 ; CHECK-NEXT: cbz x8, .LBB18_2 ; CHECK-NEXT: // %bb.1: // %overflow @@ -331,7 +331,7 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: orr x8, x1, x3 ; CHECK-NEXT: cbz x8, .LBB19_2 ; CHECK-NEXT: // %bb.1: // %overflow @@ -379,7 +379,7 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 @@ -431,7 +431,7 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 @@ -481,7 +481,7 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define 
i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 8cc2fcc362882..12c1b05dd41e0 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -223,7 +223,7 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: // %entry ; CHECK-NEXT: orr x8, x1, x3 ; CHECK-NEXT: cbz x8, .LBB4_3 ; CHECK-NEXT: // %bb.1: // %overflow @@ -277,7 +277,7 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: -; CHECK: // %bb.0: // %overflow.entry +; CHECK: // %bb.0: // %entry ; CHECK-NEXT: eor x8, x3, x2, asr #63 ; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index 0c06bd4ab277f..ace0c83e63c7c 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -3,7 +3,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: -; AARCH: // %bb.0: // %overflow.entry +; AARCH: // %bb.0: // %start ; AARCH-NEXT: orr x8, x1, x3 ; AARCH-NEXT: cbz x8, .LBB0_2 ; AARCH-NEXT: // %bb.1: // %overflow @@ -42,7 +42,7 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: -; AARCH: // %bb.0: // %overflow.entry +; AARCH: // %bb.0: // %Entry ; AARCH-NEXT: eor x8, x3, x2, asr #63 ; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] From 9fa49277edbd6ee01df24ebacc254080b88eb0b9 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 9 Sep 2025 12:03:56 +0100 Subject: [PATCH 10/12] format --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index f85976c4961ac..c8bfbf3e7484d 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6504,7 +6504,7 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, // BB overflow.no: Builder.SetInsertPoint(NoOverflowBB); - Value *ExtLoLHS, *ExtLoRHS; + Value *ExtLoLHS, *ExtLoRHS; if (IsSigned) { ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); From 08c498cfa4381dd799ff3c0b5498b929d80165b0 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Tue, 9 Sep 2025 22:17:43 +0100 Subject: [PATCH 11/12] detect optimization pattern to stop reprocessing --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 38 +++-- .../test/CodeGen/AArch64/mul_with_overflow.ll | 132 ++++++++++++++++++ 2 files changed, 162 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/mul_with_overflow.ll diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index c8bfbf3e7484d..70605e0d41840 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -338,10 +338,6 @@ class CodeGenPrepare { /// Keep track of instructions removed during 
promotion. SetOfInstrs RemovedInsts; - /// Keep track of seen mul_with_overflow intrinsics to avoid - // reprocessing them. - DenseMap SeenMulWithOverflowInstrs; - /// Keep track of sext chains based on their initial value. DenseMap SeenChainsForSExt; @@ -778,7 +774,6 @@ bool CodeGenPrepare::_run(Function &F) { verifyBFIUpdates(F); #endif - SeenMulWithOverflowInstrs.clear(); return EverMadeChange; } @@ -6409,9 +6404,36 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, ModifyDT &ModifiedDT) { if (!TLI->shouldOptimizeMulOverflowIntrinsic()) return false; - // If we have already seen this instruction, don't process it again. - if (!SeenMulWithOverflowInstrs.insert(std::make_pair(I, true)).second) - return false; + + // Check if we had already optimized this intrinsic by detecting the pattern of the changes we had made: + // Check if we are testing the high bits of the operands: + if (BasicBlock *BrBB = I->getParent()->getSinglePredecessor()) { + if (BranchInst *Br = dyn_cast(BrBB->getTerminator()); Br && Br->isConditional()) { + if (IsSigned) { + // Check: cmp(or(xor(trunc(lshr(x))), xor(trunc(lshr(x))))) + if (match(Br->getCondition(), + m_Cmp(m_Or(m_Xor(m_Trunc(m_LShr(m_Specific(I->getOperand(0)), m_Value())), m_Value()), + m_Xor(m_Trunc(m_LShr(m_Specific(I->getOperand(1)), m_Value())), m_Value())), + m_Value()))) { + LLVM_DEBUG(dbgs() << "CGP: pattern detected - bail out\n"); + // Pattern detected, bail out. + return false; + } + } + else + { + // Check: or(cmp(trunc(lshr(x)), cmp(trunc(lshr(y)))) + if (match(Br->getCondition(), + m_Or(m_Cmp(m_Trunc(m_LShr(m_Specific(I->getOperand(0)), m_Value())), m_Value()), + m_Cmp(m_Trunc(m_LShr(m_Specific(I->getOperand(1)), m_Value())), m_Value())))) { + LLVM_DEBUG(dbgs() << "CGP: pattern detected - bail out\n"); + // Pattern detected, bail out. + return false; + } + } + + } + } if (TLI->getTypeAction( I->getContext(), diff --git a/llvm/test/CodeGen/AArch64/mul_with_overflow.ll b/llvm/test/CodeGen/AArch64/mul_with_overflow.ll new file mode 100644 index 0000000000000..0d32cb7d2f5d1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/mul_with_overflow.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: asserts +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -debug-only=codegenprepare 2>%t | FileCheck %s --check-prefixes=CHECK +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG + +; DEBUG: CGP: pattern detected - bail out + +define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { +; CHECK-LABEL: test_umul_i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x9, x3, x0 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: ccmp x3, #0, #4, ne +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 +; CHECK-NEXT: madd x9, x1, x2, x9 +; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: adds x1, x11, x9 +; CHECK-NEXT: csinc w8, w8, wzr, lo +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: b.ne .LBB0_4 +; CHECK-NEXT: // %bb.2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 1 + br i1 %1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +if.end: + %2 = extractvalue { i128, i1 } %0, 0 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] + ret i128 %retval.0 +} + +; DEBUG: CGP: pattern detected - bail out + +define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { +; CHECK-LABEL: test_smul_i128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB1_3 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: adc x10, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: asr x11, x1, #63 +; CHECK-NEXT: adc x9, x9, x10 +; CHECK-NEXT: cmp x9, x11 +; CHECK-NEXT: ccmp x8, x11, #0, eq +; CHECK-NEXT: b.eq .LBB1_4 +; CHECK-NEXT: // %bb.2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_3: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 1 + br i1 %1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +if.end: + %2 = extractvalue { i128, i1 } %0, 0 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] + ret i128 %retval.0 +} + +declare i32 @error() From 8607d5baebfa2635acdfc609bb4224c628b55db1 Mon Sep 17 00:00:00 2001 From: Hassnaa Hamdi Date: Mon, 15 Sep 2025 14:04:37 +0000 Subject: [PATCH 12/12] Discard the recent work of detecting the optimization pattern to stop reprocessing, and instead use the current list of 'InsertedInsts' to keep track of processed instructions. The work of detecting the pattern needs to detect 4 patterns for cases when one/both of the parameters are constant. So the new solution is simpler and more secure. 
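A sketch of the intended guard (assumptions: 'InsertedInsts' is the SmallPtrSet that CodeGenPrepare already keeps for inserted/handled instructions, and optimizeInst already early-exits on members of that set; the legality checks and CFG rewriting are elided here and shown only in the diff below):

    bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned,
                                                 ModifyDT &ModifiedDT) {
      if (!TLI->shouldOptimizeMulOverflowIntrinsic())
        return false;
      // ... type-legality and struct-type checks elided ...
      // Record the intrinsic before rewriting the CFG around it, so the
      // existing InsertedInsts check at the top of optimizeInst skips it on
      // later iterations over the function instead of splitting its blocks
      // again.
      InsertedInsts.insert(I);
      // ... block splitting and PHI construction elided ...
      ModifiedDT = ModifyDT::ModifyBBDT;
      return true;
    }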
--- llvm/lib/CodeGen/CodeGenPrepare.cpp | 32 +---- .../test/CodeGen/AArch64/mul_with_overflow.ll | 132 ------------------ 2 files changed, 2 insertions(+), 162 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/mul_with_overflow.ll diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 70605e0d41840..0d2a68ef774a0 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -6405,36 +6405,6 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, if (!TLI->shouldOptimizeMulOverflowIntrinsic()) return false; - - // Check if we had already optimized this intrinsic by detecting the pattern of the changes we had made: - // Check if we are testing the high bits of the operands: - if (BasicBlock *BrBB = I->getParent()->getSinglePredecessor()) { - if (BranchInst *Br = dyn_cast(BrBB->getTerminator()); Br && Br->isConditional()) { - if (IsSigned) { - // Check: cmp(or(xor(trunc(lshr(x))), xor(trunc(lshr(x))))) - if (match(Br->getCondition(), - m_Cmp(m_Or(m_Xor(m_Trunc(m_LShr(m_Specific(I->getOperand(0)), m_Value())), m_Value()), - m_Xor(m_Trunc(m_LShr(m_Specific(I->getOperand(1)), m_Value())), m_Value())), - m_Value()))) { - LLVM_DEBUG(dbgs() << "CGP: pattern detected - bail out\n"); - // Pattern detected, bail out. - return false; - } - } - else - { - // Check: or(cmp(trunc(lshr(x)), cmp(trunc(lshr(y)))) - if (match(Br->getCondition(), - m_Or(m_Cmp(m_Trunc(m_LShr(m_Specific(I->getOperand(0)), m_Value())), m_Value()), - m_Cmp(m_Trunc(m_LShr(m_Specific(I->getOperand(1)), m_Value())), m_Value())))) { - LLVM_DEBUG(dbgs() << "CGP: pattern detected - bail out\n"); - // Pattern detected, bail out. - return false; - } - } - - } - } - if (TLI->getTypeAction( I->getContext(), TLI->getValueType(*DL, I->getType()->getContainedType(0))) != @@ -6459,6 +6429,8 @@ bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, if (!I->getType()->isStructTy() || I->getType()->getStructNumElements() != 2) return false; + // Keep track of the instruction so we don't optimize it again.
+ InsertedInsts.insert(I); // ---------------------------- // For the simple case where IR just checks the overflow flag, new blocks diff --git a/llvm/test/CodeGen/AArch64/mul_with_overflow.ll b/llvm/test/CodeGen/AArch64/mul_with_overflow.ll deleted file mode 100644 index 0d32cb7d2f5d1..0000000000000 --- a/llvm/test/CodeGen/AArch64/mul_with_overflow.ll +++ /dev/null @@ -1,132 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; REQUIRES: asserts -; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -debug-only=codegenprepare 2>%t | FileCheck %s --check-prefixes=CHECK -; RUN: cat %t | FileCheck %s --check-prefix=DEBUG - -; DEBUG: CGP: pattern detected - bail out - -define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { -; CHECK-LABEL: test_umul_i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: orr x8, x1, x3 -; CHECK-NEXT: cbz x8, .LBB0_3 -; CHECK-NEXT: // %bb.1: // %overflow -; CHECK-NEXT: mul x9, x3, x0 -; CHECK-NEXT: cmp x1, #0 -; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x10, x1, x2 -; CHECK-NEXT: umulh x8, x3, x0 -; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x10, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: cset w8, ne -; CHECK-NEXT: adds x1, x11, x9 -; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.ne .LBB0_4 -; CHECK-NEXT: // %bb.2: // %if.then -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl error -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x0, w0 -; CHECK-NEXT: asr x1, x0, #63 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_3: // %overflow.no -; CHECK-NEXT: umulh x1, x0, x2 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret -entry: - %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) - %1 = extractvalue { i128, i1 } %0, 1 - br i1 %1, label %if.then, label %if.end - -if.then: - %call = tail call i32 @error() - %conv1 = sext i32 %call to i128 - br label %cleanup - -if.end: - %2 = extractvalue { i128, i1 } %0, 0 - br label %cleanup - -cleanup: - %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] - ret i128 %retval.0 -} - -; DEBUG: CGP: pattern detected - bail out - -define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { -; CHECK-LABEL: test_smul_i128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: eor x8, x3, x2, asr #63 -; CHECK-NEXT: eor x9, x1, x0, asr #63 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB1_3 -; CHECK-NEXT: // %bb.1: // %overflow -; CHECK-NEXT: asr x9, x1, #63 -; CHECK-NEXT: umulh x10, x0, x2 -; CHECK-NEXT: asr x13, x3, #63 -; CHECK-NEXT: mul x11, x1, x2 -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: mul x9, x9, x2 -; CHECK-NEXT: adds x10, x11, x10 -; CHECK-NEXT: mul x14, x0, x3 -; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x8, x8, x9 -; CHECK-NEXT: mov x9, x1 -; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: asr x11, x8, #63 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: adds x1, x14, x10 -; CHECK-NEXT: smulh x9, x9, x3 -; CHECK-NEXT: adc x10, x12, x13 -; CHECK-NEXT: asr x12, x10, #63 -; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: adc x10, x11, x12 -; CHECK-NEXT: adds x8, x15, x8 -; CHECK-NEXT: asr x11, x1, #63 -; CHECK-NEXT: adc x9, x9, x10 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x8, x11, #0, eq -; CHECK-NEXT: b.eq .LBB1_4 -; CHECK-NEXT: // 
%bb.2: // %if.then -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl error -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x0, w0 -; CHECK-NEXT: asr x1, x0, #63 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: // %overflow.no -; CHECK-NEXT: smulh x1, x0, x2 -; CHECK-NEXT: .LBB1_4: -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret -entry: - %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) - %1 = extractvalue { i128, i1 } %0, 1 - br i1 %1, label %if.then, label %if.end - -if.then: - %call = tail call i32 @error() - %conv1 = sext i32 %call to i128 - br label %cleanup - -if.end: - %2 = extractvalue { i128, i1 } %0, 0 - br label %cleanup - -cleanup: - %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] - ret i128 %retval.0 -} - -declare i32 @error()