diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 8e74b8645fad9..b5322fddf7d3f 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -40,6 +40,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
+#include <limits>
 #include <optional>
 #include <utility>
@@ -60,6 +61,8 @@ STATISTIC(NumUDivURemsNarrowed,
           "Number of udivs/urems whose width was decreased");
 STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr");
 STATISTIC(NumAShrsRemoved, "Number of ashr removed");
+STATISTIC(NumLShrsRemoved, "Number of lshr removed");
+STATISTIC(NumLShrsNarrowed, "Number of lshrs whose width was decreased");
 STATISTIC(NumSRems, "Number of srem converted to urem");
 STATISTIC(NumSExt, "Number of sext converted to zext");
 STATISTIC(NumSIToFP, "Number of sitofp converted to uitofp");
@@ -93,6 +96,10 @@ STATISTIC(NumUDivURemsNarrowedExpanded,
           "Number of bound udiv's/urem's expanded");
 STATISTIC(NumNNeg, "Number of zext/uitofp non-negative deductions");
 
+static cl::opt<bool>
+    NarrowLShr("correlated-propagation-narrow-lshr", cl::init(true), cl::Hidden,
+               cl::desc("Enable narrowing of LShr instructions."));
+
 static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
   if (Constant *C = LVI->getConstant(V, At))
     return C;
@@ -751,6 +758,15 @@ static Domain getDomain(const ConstantRange &CR) {
   return Domain::Unknown;
 }
 
+/// Returns the optimal bit width for the narrowed type.
+/// If DesiredBitWidth is greater than 32, returns the maximal unsigned value
+/// and thus effectively turns off the narrowing. For DesiredBitWidth <= 32,
+/// returns 8, 16, or 32.
+static unsigned getNarrowedWidth(unsigned DesiredBitWidth) {
+  return DesiredBitWidth <= 32
+             ? std::max<unsigned>(PowerOf2Ceil(DesiredBitWidth), 8)
+             : std::numeric_limits<unsigned>::max();
+}
+
 /// Try to shrink a sdiv/srem's width down to the smallest power of two that's
 /// sufficient to contain its operands.
 static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR,
@@ -774,7 +790,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR,
     ++MinSignedBits;
 
   // Don't shrink below 8 bits wide.
-  unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MinSignedBits), 8);
+  unsigned NewWidth = getNarrowedWidth(MinSignedBits);
 
   // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
   // two.
@@ -892,7 +908,7 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR,
   // of both of the operands?
   unsigned MaxActiveBits = std::max(XCR.getActiveBits(), YCR.getActiveBits());
   // Don't shrink below 8 bits wide.
-  unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
+  unsigned NewWidth = getNarrowedWidth(MaxActiveBits);
 
   // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
   // two.
@@ -1067,6 +1083,136 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
   return narrowSDivOrSRem(Instr, LCR, RCR);
 }
 
+/// Narrows the type of the LShr instruction if the range of its possible
+/// values fits into a smaller type. Since lshr is a relatively cheap
+/// instruction, the narrowing should not happen too frequently.
+/// Performance testing and compatibility with other passes indicate that
+/// narrowing is beneficial under the following circumstances:
+///
+/// i) the narrowing occurs only if all the users of the LShr instruction are
+/// already TruncInsts;
+///
+/// ii) the narrowing is carried out to the widest TruncInst following the
+/// LShr instruction.
+///
+/// Additionally, the function optimizes the cases where the result of the
+/// LShr instruction is guaranteed to be zero or poison.
+static bool narrowLShr(BinaryOperator *LShr, LazyValueInfo *LVI) {
+  IntegerType *RetTy = dyn_cast<IntegerType>(LShr->getType());
+  if (!RetTy)
+    return false;
+
+  ConstantRange ArgRange = LVI->getConstantRangeAtUse(LShr->getOperandUse(0),
+                                                      /*UndefAllowed*/ false);
+  ConstantRange ShiftRange = LVI->getConstantRangeAtUse(LShr->getOperandUse(1),
+                                                        /*UndefAllowed*/ false);
+
+  unsigned OrigWidth = RetTy->getScalarSizeInBits();
+  unsigned MaxActiveBitsInArg = ArgRange.getActiveBits();
+  unsigned MinShiftValue =
+      ShiftRange.getUnsignedMin().getActiveBits() <= 32
+          ? static_cast<unsigned>(ShiftRange.getUnsignedMin().getZExtValue())
+          : std::numeric_limits<unsigned>::max();
+
+  // First we deal with the cases where the result is guaranteed to be zero or
+  // poison.
+
+  auto ReplaceWith = [&](Value *V) -> void {
+    LShr->replaceAllUsesWith(V);
+    LShr->eraseFromParent();
+    ++NumLShrsRemoved;
+  };
+
+  // If the shift is larger than or equal to the bit width of the argument,
+  // the instruction returns a poison value.
+  if (MinShiftValue >= OrigWidth) {
+    ReplaceWith(PoisonValue::get(RetTy));
+    return true;
+  }
+
+  // If we are guaranteed to shift away all bits, we replace the shift by the
+  // null value. We must not apply the optimization if LShr is exact, as the
+  // result may be poison in that case.
+  if (!LShr->isExact() && MinShiftValue >= MaxActiveBitsInArg) {
+    ReplaceWith(Constant::getNullValue(RetTy));
+    return true;
+  }
+
+  // Since LShr returns poison if the shift is larger than or equal to the bit
+  // width of the argument, we must make sure that the narrowed width remains
+  // larger than the maximal possible value of the shift. Otherwise some shifts
+  // that originally produce zero would produce poison after the narrowing.
+  unsigned MaxShiftValue =
+      ShiftRange.getUnsignedMax().getActiveBits() <= 32
+          ? static_cast<unsigned>(ShiftRange.getUnsignedMax().getZExtValue())
+          : std::numeric_limits<unsigned>::max();
+  if (OrigWidth <= MaxShiftValue)
+    return false;
+
+  // That's how many bits we need.
+  unsigned MaxActiveBits =
+      std::max(MaxActiveBitsInArg, ShiftRange.getActiveBits());
+
+  // We could do better, but is it worth it? With the first operand being an
+  // n-bit integer, we may limit the value of the second operand to be less
+  // than n, as larger shifts would produce zero or poison anyway. Thus the
+  // number of bits needed for the second operand is limited by Log2(n).
+  // Unfortunately, this would require introducing a select instruction (or
+  // llvm.umin) to make sure every shift amount larger than n is mapped to n
+  // and not just truncated. We do not implement it here.
+
+  // What is the smallest bit width that can accommodate the entire value
+  // ranges of both of the operands? Don't shrink below 8 bits wide.
+  unsigned NewWidth = getNarrowedWidth(MaxActiveBits);
+
+  // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+  // two.
+  if (NewWidth >= OrigWidth)
+    return false;
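+
+  // For illustration (hypothetical values), with %x known to fit in 16 bits
+  // and %s known to be smaller than 16,
+  //   %c = lshr i32 %x, %s
+  //   %t = trunc i32 %c to i16
+  // is rewritten below into
+  //   %c.arg.trunc = trunc i32 %x to i16
+  //   %c.shift.trunc = trunc i32 %s to i16
+  //   %c = lshr i16 %c.arg.trunc, %c.shift.trunc
+  //   %c.zext = zext i16 %c to i32
+  // and the leftover zext/trunc pairs are expected to be folded away by a
+  // later InstCombine run.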
+
+  // Now check that all the users are TruncInsts and find the widest one.
+  for (User *U : LShr->users()) {
+    if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+      NewWidth = std::max(NewWidth, TI->getDestTy()->getScalarSizeInBits());
+    } else {
+      return false;
+    }
+  }
+
+  // See the comment above MaxShiftValue.
+  if (NewWidth <= MaxShiftValue)
+    return false;
+
+  // We are ready to truncate.
+  IRBuilder<> B(LShr);
+  Type *TruncTy = RetTy->getWithNewBitWidth(NewWidth);
+  Value *ArgTrunc = B.CreateTruncOrBitCast(LShr->getOperand(0), TruncTy,
+                                           LShr->getName() + ".arg.trunc");
+  Value *ShiftTrunc = B.CreateTruncOrBitCast(LShr->getOperand(1), TruncTy,
+                                             LShr->getName() + ".shift.trunc");
+  Value *LShrTrunc =
+      B.CreateBinOp(LShr->getOpcode(), ArgTrunc, ShiftTrunc, LShr->getName());
+  Value *Zext = B.CreateZExt(LShrTrunc, RetTy, LShr->getName() + ".zext");
+
+  // The cast should always succeed, but better safe than sorry.
+  if (BinaryOperator *LShrTruncBO = dyn_cast<BinaryOperator>(LShrTrunc)) {
+    LShrTruncBO->setDebugLoc(LShr->getDebugLoc());
+    LShrTruncBO->setIsExact(LShr->isExact());
+  }
+  LShr->replaceAllUsesWith(Zext);
+  LShr->eraseFromParent();
+
+  ++NumLShrsNarrowed;
+  return true;
+}
+
+static bool processLShr(BinaryOperator *LShr, LazyValueInfo *LVI) {
+  return NarrowLShr ? narrowLShr(LShr, LVI) : false;
+}
+
 static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
   ConstantRange LRange =
       LVI->getConstantRangeAtUse(SDI->getOperandUse(0), /*UndefAllowed*/ false);
@@ -1093,6 +1239,11 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
   SDI->replaceAllUsesWith(BO);
   SDI->eraseFromParent();
 
+  // Check if the new LShr can be narrowed.
+  if (NarrowLShr) {
+    narrowLShr(BO, LVI);
+  }
+
   return true;
 }
 
@@ -1254,6 +1405,9 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
       case Instruction::AShr:
         BBChanged |= processAShr(cast<BinaryOperator>(&II), LVI);
         break;
+      case Instruction::LShr:
+        BBChanged |= processLShr(cast<BinaryOperator>(&II), LVI);
+        break;
       case Instruction::SExt:
         BBChanged |= processSExt(cast<SExtInst>(&II), LVI);
         break;
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/lshr-plus-instcombine.ll b/llvm/test/Transforms/CorrelatedValuePropagation/lshr-plus-instcombine.ll
new file mode 100644
index 0000000000000..0599d8b9c02fe
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/lshr-plus-instcombine.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -passes="correlated-propagation,instcombine" -S | FileCheck %s
+
+; The tests below are the same as in lshr.ll.
+; Here we test whether the CorrelatedValuePropagation pass
+; composed with InstCombinePass produces the expected optimizations.
+
+; CHECK-LABEL: @trunc_test1
+; CHECK-NEXT: [[A1:%.*]] = lshr i32 [[A:%.*]], 16
+; CHECK-NEXT: [[CARG:%.*]] = trunc nuw i32 [[A1]] to i16
+; CHECK-NEXT: [[B1:%.*]] = trunc i32 [[B:%.*]] to i16
+; CHECK-NEXT: [[CSHIFT:%.*]] = and i16 [[B1]], 15
+; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[CARG]], [[CSHIFT]]
+; CHECK-NEXT: ret i16 [[C1]]
+
+define i16 @trunc_test1(i32 %a, i32 %b) {
+  %a.eff.trunc = lshr i32 %a, 16
+  %b.eff.trunc = and i32 %b, 15
+  %c = lshr i32 %a.eff.trunc, %b.eff.trunc
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test2
+; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[A:%.*]], 2
+; CHECK-NEXT: ret i16 [[C1]]
+
+define i16 @trunc_test2(i16 %a) {
+  %a.ext = zext i16 %a to i32
+  %c = lshr i32 %a.ext, 2
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test3
+; CHECK-NEXT: [[B:%.*]] = lshr i16 [[A:%.*]], 2
+; CHECK-NEXT: [[C:%.*]] = add nuw nsw i16 [[B]], 123
+; CHECK-NEXT: ret i16 [[C]]
+
+define i16 @trunc_test3(i16 %a) {
+  %a.ext = zext i16 %a to i32
+  %b = lshr i32 %a.ext, 2
+  %c = add i32 %b, 123
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test4
+; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
+; CHECK-NEXT: [[B:%.*]] = trunc nuw nsw i32 [[A1]] to i16
+; CHECK-NEXT: [[B1:%.*]] = lshr i16 [[B]], 2
+; CHECK-NEXT: ret i16 [[B1]]
+
+define i16 @trunc_test4(i32 %a) {
+  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
+  %b = lshr i32 %a.eff.trunc, 2
+  %b.trunc.1 = trunc i32 %b to i16
+  %b.trunc.2 = trunc i32 %b to i8
+  ret i16 %b.trunc.1
+}
+
+; CHECK-LABEL: @trunc_test5
+; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
+; CHECK-NEXT: [[B:%.*]] = lshr i32 [[A1]], 2
+; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[B]], 123
+; CHECK-NEXT: ret i32 [[C]]
+
+define i32 @trunc_test5(i32 %a) {
+  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
+  %b = lshr i32 %a.eff.trunc, 2
+  %b.trunc.1 = trunc i32 %b to i16
+  %b.trunc.2 = trunc i32 %b to i8
+  %c = add i32 %b, 123
+  ret i32 %c
+}
+
+; CHECK-LABEL: @zero_test1
+; CHECK-NEXT: ret i32 poison
+
+define i32 @zero_test1(i32 %a) {
+  %b = lshr i32 %a, 32
+  %c = add i32 %b, 123
+  ret i32 %c
+}
+
+; CHECK-LABEL: @zero_test2
+; CHECK-NEXT: ret i32 poison
+
+define i32 @zero_test2(i32 %a, i32 %b) {
+  %b.large = add nuw nsw i32 %b, 50
+  %c = lshr i32 %a, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}
+
+; CHECK-LABEL: @zero_test3
+; CHECK-NEXT: ret i32 123
+
+define i32 @zero_test3(i32 %a, i32 %b) {
+  %a.small = lshr i32 %a, 16
+  %b.large = add nuw nsw i32 %b, 20
+  %c = lshr i32 %a.small, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}
+
+; CHECK-LABEL: @zero_test4
+; CHECK-NEXT: ret i32 123
+
+define i32 @zero_test4(i32 %a, i32 %b) {
+  %a.small = lshr i32 %a, 16
+  %b.large = add nuw nsw i32 %b, 20
+  %c = lshr exact i32 %a.small, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/lshr.ll b/llvm/test/Transforms/CorrelatedValuePropagation/lshr.ll
new file mode 100644
index 0000000000000..7f1e866874f3f
--- /dev/null
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/lshr.ll
@@ -0,0 +1,266 @@
+; RUN: opt < %s -passes=correlated-propagation -S | FileCheck %s
+
+; Tests: test_nop and tests 1 through 6 are taken from udiv.ll
+; with udiv replaced by lshr (plus some tweaks).
+; In those tests the lshr instruction has no users.
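+
+; For instance (illustrative, not one of the tests below): with %n known to
+; fit in 16 bits,
+;   %shr = lshr i32 %n, 2
+; is rewritten to an i16 lshr on a truncated operand whose result is
+; zero-extended back to i32; that i16 lshr is what the checks below match.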
+
+; CHECK-LABEL: @test_nop
+define void @test_nop(i32 %n) {
+; CHECK: lshr i32 %n, 2
+  %shr = lshr i32 %n, 2
+  ret void
+}
+
+; CHECK-LABEL: @test1(
+define void @test1(i32 %n) {
+entry:
+  %cmp = icmp ule i32 %n, 65535
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i16
+  %shr = lshr i32 %n, 2
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @test2(
+define void @test2(i32 %n) {
+entry:
+  %cmp = icmp ule i32 %n, 65536
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i32
+  %shr = lshr i32 %n, 2
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @test3(
+define void @test3(i32 %m, i32 %n) {
+entry:
+  %cmp1 = icmp ult i32 %m, 65535
+  %cmp2 = icmp ult i32 %n, 65535
+  %cmp = and i1 %cmp1, %cmp2
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i32
+  %shr = lshr i32 %m, %n
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @test3a(
+define void @test3a(i32 %m, i32 %n) {
+entry:
+  %cmp1 = icmp ult i32 %m, 65535
+  %cmp2 = icmp ult i32 %n, 17
+  %cmp = and i1 %cmp1, %cmp2
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i32
+  %shr = lshr i32 %m, %n
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @test3b(
+define void @test3b(i32 %m, i32 %n) {
+entry:
+  %cmp1 = icmp ult i32 %m, 65535
+  %cmp2 = icmp ult i32 %n, 16
+  %cmp = and i1 %cmp1, %cmp2
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i16
+  %shr = lshr i32 %m, %n
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: @test5
+define void @test5(i32 %n) {
+  %trunc = and i32 %n, 65535
+  ; CHECK: lshr i16
+  %shr = lshr i32 %trunc, 2
+  ret void
+}
+
+; CHECK-LABEL: @test5a
+define void @test5a(i32 %n) {
+  %trunc = and i32 %n, 65535
+  ; CHECK: lshr i16
+  %shr = lshr i32 %trunc, 15
+  ret void
+}
+
+; CHECK-LABEL: @test6
+define void @test6(i32 %n) {
+entry:
+  %cmp = icmp ule i32 %n, 255
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+; CHECK: lshr i8
+  %shr = lshr i32 %n, 2
+  br label %exit
+
+exit:
+  ret void
+}
+
+; The tests below check whether the narrowing occurs only if the appropriate
+; `trunc` instructions follow.
+;
+; Just as in udiv.ll, additional zext and trunc instructions appear.
+; They are eventually recombined by the InstCombinePass
+; that follows in the pipeline.
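+;
+; For example (illustrative pattern, not a test): a leftover pair such as
+;   %c.zext = zext i16 %c to i32
+;   %c.trunc = trunc i32 %c.zext to i16
+; folds back to plain %c once InstCombine runs.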
+
+; CHECK-LABEL: @trunc_test1
+; CHECK-NEXT: [[A1:%.*]] = lshr i32 [[A:%.*]], 16
+; CHECK-NEXT: [[B1:%.*]] = and i32 [[B:%.*]], 15
+; CHECK-NEXT: [[A2:%.*]] = trunc i32 [[A1]] to i16
+; CHECK-NEXT: [[B2:%.*]] = trunc i32 [[B1]] to i16
+; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[A2]], [[B2]]
+; CHECK-NEXT: [[C2:%.*]] = zext i16 [[C1]] to i32
+; CHECK-NEXT: [[C3:%.*]] = trunc i32 [[C2]] to i16
+; CHECK-NEXT: ret i16 [[C3]]
+
+define i16 @trunc_test1(i32 %a, i32 %b) {
+  %a.eff.trunc = lshr i32 %a, 16
+  %b.eff.trunc = and i32 %b, 15
+  %c = lshr i32 %a.eff.trunc, %b.eff.trunc
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test2
+; CHECK-NEXT: [[A1:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT: [[A2:%.*]] = trunc i32 [[A1]] to i16
+; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[A2]], 2
+; CHECK-NEXT: [[C2:%.*]] = zext i16 [[C1]] to i32
+; CHECK-NEXT: [[C3:%.*]] = trunc i32 [[C2]] to i16
+; CHECK-NEXT: ret i16 [[C3]]
+
+define i16 @trunc_test2(i16 %a) {
+  %a.ext = zext i16 %a to i32
+  %c = lshr i32 %a.ext, 2
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test3
+; CHECK-NEXT: [[A1:%.*]] = zext i16 [[A:%.*]] to i32
+; CHECK-NEXT: [[B:%.*]] = lshr i32 [[A1]], 2
+; CHECK-NEXT: [[C0:%.*]] = add nuw nsw i32 [[B]], 123
+; CHECK-NEXT: [[C1:%.*]] = trunc i32 [[C0]] to i16
+; CHECK-NEXT: ret i16 [[C1]]
+
+define i16 @trunc_test3(i16 %a) {
+  %a.ext = zext i16 %a to i32
+  %b = lshr i32 %a.ext, 2
+  %c = add i32 %b, 123
+  %c.trunc = trunc i32 %c to i16
+  ret i16 %c.trunc
+}
+
+; CHECK-LABEL: @trunc_test4
+; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
+; CHECK-NEXT: [[B0:%.*]] = trunc i32 [[A1]] to i16
+; CHECK-NEXT: [[B1:%.*]] = lshr i16 [[B0]], 2
+; CHECK-NEXT: [[B2:%.*]] = zext i16 [[B1]] to i32
+; CHECK-NEXT: [[C1:%.*]] = trunc i32 [[B2]] to i16
+; CHECK-NEXT: [[C2:%.*]] = trunc i32 [[B2]] to i8
+; CHECK-NEXT: ret i16 [[C1]]
+
+define i16 @trunc_test4(i32 %a) {
+  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
+  %b = lshr i32 %a.eff.trunc, 2
+  %b.trunc.1 = trunc i32 %b to i16
+  %b.trunc.2 = trunc i32 %b to i8
+  ret i16 %b.trunc.1
+}
+
+; CHECK-LABEL: @trunc_test5
+; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
+; CHECK-NEXT: [[B:%.*]] = lshr i32 [[A1]], 2
+; CHECK-NEXT: [[B1:%.*]] = trunc i32 [[B]] to i16
+; CHECK-NEXT: [[B2:%.*]] = trunc i32 [[B]] to i8
+; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[B]], 123
+; CHECK-NEXT: ret i32 [[C]]
+
+define i32 @trunc_test5(i32 %a) {
+  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
+  %b = lshr i32 %a.eff.trunc, 2
+  %b.trunc.1 = trunc i32 %b to i16
+  %b.trunc.2 = trunc i32 %b to i8
+  %c = add i32 %b, 123
+  ret i32 %c
+}
+
+; Test cases where lshr simplifies to zero or poison.
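+;
+; Recall that per the LangRef, lshr returns poison when the shift amount is
+; equal to or larger than the bit width of the first operand, e.g.
+;   %p = lshr i32 %a, 32   ; %p is poison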
+
+; CHECK-LABEL: @zero_test1
+; CHECK-NEXT: [[C:%.*]] = add i32 poison, 123
+; CHECK-NEXT: ret i32 [[C]]
+
+define i32 @zero_test1(i32 %a) {
+  %b = lshr i32 %a, 32
+  %c = add i32 %b, 123
+  ret i32 %c
+}
+
+; CHECK-LABEL: @zero_test2
+; CHECK-NEXT: [[B1:%.*]] = add nuw nsw i32 [[B:%.*]], 50
+; CHECK-NEXT: [[D:%.*]] = add i32 poison, 123
+; CHECK-NEXT: ret i32 [[D]]
+
+define i32 @zero_test2(i32 %a, i32 %b) {
+  %b.large = add nuw nsw i32 %b, 50
+  %c = lshr i32 %a, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}
+
+; CHECK-LABEL: @zero_test3
+; CHECK-NEXT: [[A1:%.*]] = lshr i32 [[A:%.*]], 16
+; CHECK-NEXT: [[B1:%.*]] = add nuw nsw i32 [[B:%.*]], 20
+; CHECK-NEXT: [[D:%.*]] = add nuw nsw i32 0, 123
+; CHECK-NEXT: ret i32 123
+
+define i32 @zero_test3(i32 %a, i32 %b) {
+  %a.small = lshr i32 %a, 16
+  %b.large = add nuw nsw i32 %b, 20
+  %c = lshr i32 %a.small, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}
+
+; CHECK-LABEL: @zero_test4
+; CHECK-NEXT: [[A1:%.*]] = lshr i32 [[A:%.*]], 16
+; CHECK-NEXT: [[B1:%.*]] = add nuw nsw i32 [[B:%.*]], 20
+; CHECK-NEXT: [[C:%.*]] = lshr exact i32 [[A1]], [[B1]]
+; CHECK-NEXT: [[D:%.*]] = add nuw nsw i32 [[C]], 123
+; CHECK-NEXT: ret i32 123
+
+define i32 @zero_test4(i32 %a, i32 %b) {
+  %a.small = lshr i32 %a, 16
+  %b.large = add nuw nsw i32 %b, 20
+  %c = lshr exact i32 %a.small, %b.large
+  %d = add i32 %c, 123
+  ret i32 %d
+}