diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 2ba8b29e775e0..3ba09e3685b40 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3470,6 +3470,10 @@ class LLVM_ABI TargetLoweringBase { return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); } + // Return true if the target wants to optimize the mul overflow intrinsic + // by detecting if there is no overflow. + virtual bool shouldOptimizeMulOverflowIntrinsic() const { return false; } + // Return true if it is profitable to use a scalar input to a BUILD_VECTOR // even if the vector itself has multiple uses. virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9db4c9e5e2807..0d2a68ef774a0 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -431,6 +431,8 @@ class CodeGenPrepare { bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); + bool optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT); bool optimizeExt(Instruction *&I); @@ -2778,6 +2780,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { } } return false; + case Intrinsic::umul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT); + case Intrinsic::smul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT); } SmallVector PtrOps; @@ -6389,6 +6395,235 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, return true; } +// Rewrite the mul_with_overflow intrinsic by checking if both of the +// operands' value range is within the legal type. If so, we can optimize the +// multiplication algorithm. This code is supposed to be written during the step +// of type legalization, but given that we need to reconstruct the IR which is +// not doable there, we do it here. +bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT) { + if (!TLI->shouldOptimizeMulOverflowIntrinsic()) + return false; + + if (TLI->getTypeAction( + I->getContext(), + TLI->getValueType(*DL, I->getType()->getContainedType(0))) != + TargetLowering::TypeExpandInteger) + return false; + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + Type *Ty = LHS->getType(); + unsigned VTBitWidth = Ty->getScalarSizeInBits(); + unsigned VTHalfBitWidth = VTBitWidth / 2; + IntegerType *LegalTy = + IntegerType::getIntNTy(I->getContext(), VTHalfBitWidth); + + // Skip the optimization if the type with HalfBitWidth is not legal for the + // target. + if (TLI->getTypeAction(I->getContext(), TLI->getValueType(*DL, LegalTy)) != + TargetLowering::TypeLegal) + return false; + + // Make sure that the I->getType() is a struct type with two elements. + if (!I->getType()->isStructTy() || I->getType()->getStructNumElements() != 2) + return false; + + // Keep track of the instruction to stop reoptimizing it again. + InsertedInsts.insert(I); + // ---------------------------- + + // For the simple case where IR just checks the overflow flag, new blocks + // should be: + // entry: + // if signed: + // (lhs_lo ^ lhs_hi) || (rhs_lo ^ rhs_hi) ? 
overflow, overflow_no + // else: + // (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no + // overflow_no: + // overflow: + + // otherwise, new blocks should be: + // entry: + // if signed: + // (lhs_lo ^ lhs_hi) || (rhs_lo ^ rhs_hi) ? overflow, overflow_no + // else: + // (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no + // overflow_no: + // overflow: + // overflow.res: + + // New BBs: + std::string KeepBBName = I->getParent()->getName().str(); + BasicBlock *OverflowEntryBB = + I->getParent()->splitBasicBlock(I, "overflow.entry", /*Before*/ true); + // Remove the 'br' instruction that is generated as a result of the split: + OverflowEntryBB->getTerminator()->eraseFromParent(); + BasicBlock *NoOverflowBB = + BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction()); + NoOverflowBB->moveAfter(OverflowEntryBB); + BasicBlock *OverflowBB = + BasicBlock::Create(I->getContext(), "overflow", I->getFunction()); + OverflowBB->moveAfter(NoOverflowBB); + + // BB overflow.entry: + IRBuilder<> Builder(OverflowEntryBB); + // Get Lo and Hi of LHS & RHS: + Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); + Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); + Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); + + Value *IsAnyBitTrue; + if (IsSigned) { + Value *SignLoLHS = + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + Value *SignLoRHS = + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + Value *XorLHS = Builder.CreateXor(HiLHS, SignLoLHS); + Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); + Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); + IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or, + ConstantInt::getNullValue(Or->getType())); + } else { + Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + ConstantInt::getNullValue(LegalTy)); + Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + } + Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB); + + // BB overflow.no: + Builder.SetInsertPoint(NoOverflowBB); + Value *ExtLoLHS, *ExtLoRHS; + if (IsSigned) { + ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + } else { + ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + } + + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); + + // In overflow.no BB: we are sure that the overflow flag is false. + // So, if we found this pattern: + // br (extractvalue (%mul, 1)), label %if.then, label %if.end + // then we can jump directly to %if.end as we're sure that there is no + // overflow. + BasicBlock *DetectNoOverflowBrBB = nullptr; + StructType *STy = StructType::get( + I->getContext(), {Ty, IntegerType::getInt1Ty(I->getContext())}); + // Look for the pattern in the users of I, and make sure that all the users + // are either part of the pattern or NOT in the same BB as I. 
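+  // As an illustration (the value and label names below are made up), the
+  // shape being matched is:
+  //   %res = call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %a, i128 %b)
+  //   %ovf = extractvalue { i128, i1 } %res, 1
+  //   br i1 %ovf, label %if.then, label %if.end
+  // When that is the only use in this block, 'overflow.no' can branch straight
+  // to %if.end (the false successor of that br).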
+  for (User *U : I->users()) {
+    if (auto *Instr = dyn_cast<Instruction>(U);
+        Instr && Instr->getParent() != I->getParent())
+      continue;
+
+    if (auto *ExtUser = dyn_cast<ExtractValueInst>(U)) {
+      if (ExtUser->hasOneUse() && ExtUser->getNumIndices() == 1 &&
+          ExtUser->getIndices()[0] == 1) {
+        if (auto *Br = dyn_cast<BranchInst>(*ExtUser->user_begin())) {
+          DetectNoOverflowBrBB = Br->getSuccessor(1) /*if.end*/;
+          continue;
+        }
+      }
+    }
+    // If we get here, either the pattern does not exist or there are multiple
+    // users in the same BB.
+    DetectNoOverflowBrBB = nullptr;
+    break;
+  }
+  if (DetectNoOverflowBrBB) {
+    // BB overflow.no: jump directly to the if.end BB.
+    Builder.CreateBr(DetectNoOverflowBrBB);
+
+    // BB if.end:
+    Builder.SetInsertPoint(DetectNoOverflowBrBB,
+                           DetectNoOverflowBrBB->getFirstInsertionPt());
+    // Create a PHI node to receive the multiplication result from the
+    // 'overflow.no' and 'overflow' BBs.
+    PHINode *NoOverflowPHI = Builder.CreatePHI(Ty, 2);
+    NoOverflowPHI->addIncoming(Mul, NoOverflowBB);
+    // Create a struct value to replace all uses of I.
+    Value *StructValNoOverflow = PoisonValue::get(STy);
+    StructValNoOverflow =
+        Builder.CreateInsertValue(StructValNoOverflow, NoOverflowPHI, {0});
+    // The overflow flag is always false, as we know there is no overflow.
+    StructValNoOverflow = Builder.CreateInsertValue(
+        StructValNoOverflow, ConstantInt::getFalse(I->getContext()), {1});
+    // Replace all uses of I, but only those dominated by the if.end BB.
+    I->replaceUsesOutsideBlock(StructValNoOverflow, I->getParent());
+
+    // Remove the original BB as it's divided into 'overflow.entry' and
+    // 'overflow' BBs.
+    BasicBlock *ToBeRemoveBB = I->getParent();
+    // BB overflow:
+    OverflowBB->splice(OverflowBB->end(), ToBeRemoveBB);
+    // Extract the multiplication result to add it to the PHI node in the
+    // if.end BB.
+    Builder.SetInsertPoint(OverflowBB, OverflowBB->end());
+    Value *IntrinsicMulRes = Builder.CreateExtractValue(I, {0}, "mul.extract");
+    cast<Instruction>(IntrinsicMulRes)->moveAfter(I);
+    NoOverflowPHI->addIncoming(IntrinsicMulRes, OverflowBB);
+
+    ToBeRemoveBB->eraseFromParent();
+    // Restore the original name of the overflow.entry BB:
+    OverflowEntryBB->setName(KeepBBName);
+    ModifiedDT = ModifyDT::ModifyBBDT;
+    return true;
+  }
+
+  // Otherwise, we need to create the 'overflow.res' BB to merge the results of
+  // the two paths:
+  I->getParent()->setName("overflow.res");
+  BasicBlock *OverflowResBB = I->getParent();
+
+  // BB overflow.no: jump to the overflow.res BB.
+  Builder.CreateBr(OverflowResBB);
+
+  // BB overflow.res:
+  Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt());
+  PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2),
+          *OverflowFlagPHI =
+              Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2);
+
+  Value *StructValOverflowRes = PoisonValue::get(STy);
+  StructValOverflowRes =
+      Builder.CreateInsertValue(StructValOverflowRes, OverflowResPHI, {0});
+  StructValOverflowRes =
+      Builder.CreateInsertValue(StructValOverflowRes, OverflowFlagPHI, {1});
+  OverflowResPHI->addIncoming(Mul, NoOverflowBB);
+  OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()),
+                               NoOverflowBB);
+
+  // Before moving the mul.overflow intrinsic to OverflowBB, replace all its
+  // uses by StructValOverflowRes.
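+  // For reference, the IR produced on this generic path looks roughly like
+  // this (value and block names are illustrative only):
+  //   overflow.entry:
+  //     ...split the operands and test the high halves...
+  //     br i1 %any.hi.set, label %overflow, label %overflow.no
+  //   overflow.no:
+  //     %mul = mul i128 %lo.lhs.ext, %lo.rhs.ext
+  //     br label %overflow.res
+  //   overflow:
+  //     %res = call { i128, i1 } @llvm.umul.with.overflow.i128(...) ; or smul
+  //     br label %overflow.res
+  //   overflow.res:
+  //     PHIs merge the product and the overflow flag for the original users.
+  // With that layout in mind, rewire all uses of I first, then move it: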
+ I->replaceAllUsesWith(StructValOverflowRes); + I->removeFromParent(); + + // BB overflow: + I->insertInto(OverflowBB, OverflowBB->end()); + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); + Builder.CreateBr(OverflowResBB); + + // Add The Extracted values to the PHINodes in the overflow.res block. + OverflowResPHI->addIncoming(MulOverflow, OverflowBB); + OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB); + + // Restore the original name of the overflow.entry BB: + OverflowEntryBB->setName(KeepBBName); + + ModifiedDT = ModifyDT::ModifyBBDT; + return true; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f5d14905cac66..36938a493e923 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -321,6 +321,10 @@ class AArch64TargetLowering : public TargetLowering { return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } + // Return true if the target wants to optimize the mul overflow intrinsic + // by detecting if there is no overflow. + bool shouldOptimizeMulOverflowIntrinsic() const override { return true; } + Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 9e1c0c1b115ab..12ae241dda4bd 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -262,20 +262,28 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -290,19 +298,27 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB18_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 
+; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w2, w8, wzr, lo +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -316,19 +332,28 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mul x9, x3, x0 +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB19_2 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 -; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: adds x9, x11, x9 +; CHECK-NEXT: adds x9, x12, x11 ; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: b .LBB19_3 +; CHECK-NEXT: .LBB19_2: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB19_3: // %overflow.res ; CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq @@ -355,6 +380,11 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB21_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -364,24 +394,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq -; CHECK-NEXT: cset w2, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: eor 
w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB21_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -396,6 +432,11 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB22_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -405,24 +446,29 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w2, ne +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -436,6 +482,11 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB23_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -445,29 +496,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: adc x9, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x9, x14, x10 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: adc x10, x12, x13 -; CHECK-NEXT: smulh x12, x1, x3 -; CHECK-NEXT: asr x13, x8, #63 -; CHECK-NEXT: asr x14, x10, #63 -; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: adds x8, x11, x8 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: adc x10, x12, x10 -; CHECK-NEXT: eor x12, x3, x1 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x10, x10, x11 -; CHECK-NEXT: asr x11, x12, #63 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff -; 
CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csinv x0, x13, x11, eq -; CHECK-NEXT: csel x1, x10, x9, ne +; CHECK-NEXT: adds x8, x14, x10 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: asr x14, x8, #63 +; CHECK-NEXT: smulh x10, x1, x3 +; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: asr x13, x11, #63 +; CHECK-NEXT: adds x11, x9, x11 +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: adc x12, x12, x13 +; CHECK-NEXT: adds x11, x15, x11 +; CHECK-NEXT: adc x10, x10, x12 +; CHECK-NEXT: cmp x11, x14 +; CHECK-NEXT: ccmp x10, x14, #0, eq +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: b .LBB23_3 +; CHECK-NEXT: .LBB23_2: // %overflow.no +; CHECK-NEXT: smulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: .LBB23_3: // %overflow.res +; CHECK-NEXT: eor x11, x3, x1 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: asr x11, x11, #63 +; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff +; CHECK-NEXT: csinv x0, x9, x11, eq +; CHECK-NEXT: csel x1, x12, x8, ne ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 9924b7c63f763..12c1b05dd41e0 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -224,21 +224,24 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB4_3 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.ne .LBB4_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: b.ne .LBB4_4 +; CHECK-NEXT: // %bb.2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -248,7 +251,9 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_2: // %if.end +; CHECK-NEXT: .LBB4_3: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: .LBB4_4: ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: ret entry: @@ -273,34 +278,38 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: umulh x11, x0, x2 -; CHECK-NEXT: asr x14, x3, #63 -; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mul x12, x1, x2 -; CHECK-NEXT: umulh x9, x1, x2 -; CHECK-NEXT: mul x10, x10, x2 -; CHECK-NEXT: adds x11, x12, x11 -; CHECK-NEXT: mul x15, x0, x3 -; CHECK-NEXT: umulh x13, x0, x3 -; CHECK-NEXT: adc x9, x9, x10 -; CHECK-NEXT: mul x14, x0, x14 -; CHECK-NEXT: mul x16, x1, x3 -; CHECK-NEXT: adds x1, x15, x11 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: smulh x8, x8, x3 -; CHECK-NEXT: adc x10, x13, x14 +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB5_3 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 ; CHECK-NEXT: asr x12, x10, #63 -; CHECK-NEXT: adds x9, x9, x10 +; CHECK-NEXT: adds x8, x8, x10 ; CHECK-NEXT: adc x10, x11, x12 -; CHECK-NEXT: adds x9, x16, x9 +; CHECK-NEXT: adds x8, x15, x8 ; CHECK-NEXT: asr x11, x1, #63 -; CHECK-NEXT: adc x8, x8, x10 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x9, x9, x11 -; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB5_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: adc x9, x9, x10 +; CHECK-NEXT: cmp x9, x11 +; CHECK-NEXT: ccmp x8, x11, #0, eq +; CHECK-NEXT: b.eq .LBB5_4 +; CHECK-NEXT: // %bb.2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -310,7 +319,9 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: .LBB5_3: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: .LBB5_4: ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index edfd80b4f2706..ace0c83e63c7c 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -4,20 +4,28 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: ; AARCH: // %bb.0: // %start +; AARCH-NEXT: orr x8, x1, x3 +; AARCH-NEXT: cbz x8, .LBB0_2 +; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: mul x9, x3, x0 ; AARCH-NEXT: cmp x1, #0 ; AARCH-NEXT: ccmp x3, #0, #4, ne -; AARCH-NEXT: umulh x8, x1, x2 -; AARCH-NEXT: umulh x10, x3, x0 +; AARCH-NEXT: umulh x10, x1, x2 +; AARCH-NEXT: umulh x8, x3, x0 ; AARCH-NEXT: madd x9, x1, x2, x9 -; AARCH-NEXT: ccmp xzr, x8, #0, eq -; AARCH-NEXT: umulh x11, x0, x2 ; AARCH-NEXT: ccmp xzr, x10, #0, eq +; AARCH-NEXT: umulh x11, x0, x2 +; AARCH-NEXT: ccmp xzr, x8, #0, eq ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: cset w8, ne ; AARCH-NEXT: adds x1, x11, x9 ; AARCH-NEXT: csinc w2, w8, wzr, lo ; AARCH-NEXT: ret +; AARCH-NEXT: .LBB0_2: // %overflow.no +; AARCH-NEXT: umulh x1, x0, x2 +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: mov w2, wzr +; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 %1 = extractvalue { i128, i1 } %0, 0 @@ -35,45 +43,56 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: ; AARCH: // %bb.0: // %Entry -; AARCH-NEXT: asr x11, x1, #63 -; AARCH-NEXT: asr x9, x3, #63 -; AARCH-NEXT: umulh x12, x0, x2 -; AARCH-NEXT: mov x8, x1 +; AARCH-NEXT: eor x8, x3, x2, asr #63 +; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: mul x13, x1, x2 -; AARCH-NEXT: umulh x10, x1, x2 -; AARCH-NEXT: mul x11, x11, x2 -; AARCH-NEXT: adds x12, x13, x12 -; AARCH-NEXT: mul x15, x0, x3 -; AARCH-NEXT: umulh x14, x0, x3 -; AARCH-NEXT: adc x10, x10, x11 -; AARCH-NEXT: mul x9, x0, x9 -; AARCH-NEXT: mul x16, x1, x3 -; AARCH-NEXT: adds x1, x15, x12 -; AARCH-NEXT: asr x12, x10, #63 -; AARCH-NEXT: smulh x11, x8, x3 -; AARCH-NEXT: adc x9, x14, x9 -; AARCH-NEXT: asr x13, x9, #63 -; AARCH-NEXT: adds x9, x10, x9 -; AARCH-NEXT: asr x10, x1, #63 +; AARCH-NEXT: orr x8, x9, x8 +; AARCH-NEXT: cbz x8, .LBB1_2 +; AARCH-NEXT: // %bb.1: // %overflow +; AARCH-NEXT: asr x9, x1, #63 +; AARCH-NEXT: umulh x10, x0, x2 +; AARCH-NEXT: asr x13, x3, #63 +; AARCH-NEXT: mul x11, x1, x2 +; AARCH-NEXT: umulh x8, x1, x2 +; AARCH-NEXT: mul x9, x9, x2 +; AARCH-NEXT: adds x10, x11, x10 +; AARCH-NEXT: mul x14, x0, x3 +; AARCH-NEXT: umulh x12, x0, x3 +; AARCH-NEXT: adc x9, x8, x9 +; AARCH-NEXT: mul x13, x0, x13 +; AARCH-NEXT: adds x8, x14, x10 +; AARCH-NEXT: mul x15, x1, x3 +; AARCH-NEXT: smulh x10, x1, x3 +; AARCH-NEXT: adc x11, x12, x13 +; AARCH-NEXT: asr x12, x9, #63 +; AARCH-NEXT: asr x13, x11, #63 +; AARCH-NEXT: adds x9, x9, x11 +; AARCH-NEXT: asr x11, x8, #63 ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: adc x12, x12, x13 -; 
AARCH-NEXT: adds x9, x16, x9 -; AARCH-NEXT: adc x11, x11, x12 -; AARCH-NEXT: cmp x9, x10 -; AARCH-NEXT: ccmp x11, x10, #0, eq +; AARCH-NEXT: adds x9, x15, x9 +; AARCH-NEXT: adc x10, x10, x12 +; AARCH-NEXT: cmp x9, x11 +; AARCH-NEXT: ccmp x10, x11, #0, eq ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbz x8, #63, .LBB1_2 -; AARCH-NEXT: // %bb.1: // %Entry -; AARCH-NEXT: eor x8, x3, #0x8000000000000000 -; AARCH-NEXT: orr x8, x2, x8 -; AARCH-NEXT: cbz x8, .LBB1_3 -; AARCH-NEXT: .LBB1_2: // %Else2 -; AARCH-NEXT: cbz w9, .LBB1_4 -; AARCH-NEXT: .LBB1_3: // %Then7 -; AARCH-NEXT: mov w8, #1 // =0x1 -; AARCH-NEXT: str w8, [x4] -; AARCH-NEXT: .LBB1_4: // %Block9 +; AARCH-NEXT: tbnz x1, #63, .LBB1_3 +; AARCH-NEXT: b .LBB1_4 +; AARCH-NEXT: .LBB1_2: // %overflow.no +; AARCH-NEXT: smulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: tbz x1, #63, .LBB1_4 +; AARCH-NEXT: .LBB1_3: // %overflow.res +; AARCH-NEXT: eor x10, x3, #0x8000000000000000 +; AARCH-NEXT: orr x10, x2, x10 +; AARCH-NEXT: cbz x10, .LBB1_5 +; AARCH-NEXT: .LBB1_4: // %Else2 +; AARCH-NEXT: cbz w9, .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %Then7 +; AARCH-NEXT: mov w9, #1 // =0x1 +; AARCH-NEXT: str w9, [x4] +; AARCH-NEXT: .LBB1_6: // %Block9 +; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: store i32 0, ptr %2, align 4