Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 156 additions & 2 deletions llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <limits>
#include <optional>
#include <utility>

Expand All @@ -60,6 +61,8 @@ STATISTIC(NumUDivURemsNarrowed,
"Number of udivs/urems whose width was decreased");
STATISTIC(NumAShrsConverted, "Number of ashr converted to lshr");
STATISTIC(NumAShrsRemoved, "Number of ashr removed");
STATISTIC(NumLShrsRemoved, "Number of lshr removed");
STATISTIC(NumLShrsNarrowed, "Number of lshrs whose width was decreased");
STATISTIC(NumSRems, "Number of srem converted to urem");
STATISTIC(NumSExt, "Number of sext converted to zext");
STATISTIC(NumSIToFP, "Number of sitofp converted to uitofp");
Expand Down Expand Up @@ -93,6 +96,10 @@ STATISTIC(NumUDivURemsNarrowedExpanded,
"Number of bound udiv's/urem's expanded");
STATISTIC(NumNNeg, "Number of zext/uitofp non-negative deductions");

static cl::opt<bool>
NarrowLShr("correlated-propagation-narrow-lshr", cl::init(true), cl::Hidden,
cl::desc("Enable narrowing of LShr instructions."));

static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At))
return C;
Expand Down Expand Up @@ -751,6 +758,15 @@ static Domain getDomain(const ConstantRange &CR) {
return Domain::Unknown;
}

/// Returns the optimal bit width for the narrowed type.
/// If \p DesiredBitWidth is greater than 32, returns the maximum unsigned
/// value and thus effectively turns off the narrowing. For
/// DesiredBitWidth <= 32 returns 8, 16, or 32 (i.e. i8, i16, i32).
static unsigned getNarrowedWidth(unsigned DesiredBitWidth) {
  // Narrowing above 32 bits is not profitable; report an impossibly wide
  // type so callers' "NewWidth >= OrigWidth" guards bail out.
  if (DesiredBitWidth > 32)
    return std::numeric_limits<unsigned>::max();
  // Round up to a power of two, but don't shrink below 8 bits wide.
  return std::max<unsigned>(PowerOf2Ceil(DesiredBitWidth), 8);
}

/// Try to shrink a sdiv/srem's width down to the smallest power of two that's
/// sufficient to contain its operands.
static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR,
Expand All @@ -774,7 +790,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR,
++MinSignedBits;

// Don't shrink below 8 bits wide.
unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MinSignedBits), 8);
unsigned NewWidth = getNarrowedWidth(MinSignedBits);

// NewWidth might be greater than OrigWidth if OrigWidth is not a power of
// two.
Expand Down Expand Up @@ -892,7 +908,7 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR,
// of both of the operands?
unsigned MaxActiveBits = std::max(XCR.getActiveBits(), YCR.getActiveBits());
// Don't shrink below 8 bits wide.
unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
unsigned NewWidth = getNarrowedWidth(MaxActiveBits);

// NewWidth might be greater than OrigWidth if OrigWidth is not a power of
// two.
Expand Down Expand Up @@ -1067,6 +1083,136 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
return narrowSDivOrSRem(Instr, LCR, RCR);
}

/**
 * @brief Narrows the type of an LShr instruction if the range of its possible
 * values fits into a smaller type. Since LShr is a relatively cheap
 * instruction, the narrowing should not happen too frequently. Performance
 * testing and compatibility with other passes indicate that the narrowing is
 * beneficial under the following circumstances:
 *
 * i) the narrowing occurs only if all the users of the LShr instruction are
 * already TruncInst;
 *
 * ii) the narrowing is carried out to the largest TruncInst following the LShr
 * instruction.
 *
 * Additionally, the function optimizes the cases where the result of the LShr
 * instruction is guaranteed to vanish or be equal to poison.
 *
 * \param LShr the lshr instruction to examine.
 * \param LVI  analysis used to query the operands' value ranges.
 * \returns true if the instruction was removed, replaced, or narrowed.
 */
static bool narrowLShr(BinaryOperator *LShr, LazyValueInfo *LVI) {

  // Only scalar integer lshr is handled; vector types are rejected here.
  IntegerType *RetTy = dyn_cast<IntegerType>(LShr->getType());
  if (!RetTy)
    return false;

  // Value ranges of the shifted value (operand 0) and the shift amount
  // (operand 1) at this particular use.
  ConstantRange ArgRange = LVI->getConstantRangeAtUse(LShr->getOperandUse(0),
                                                      /*UndefAllowed*/ false);
  ConstantRange ShiftRange = LVI->getConstantRangeAtUse(LShr->getOperandUse(1),
                                                        /*UndefAllowed*/ false);

  unsigned OrigWidth = RetTy->getScalarSizeInBits();
  unsigned MaxActiveBitsInArg = ArgRange.getActiveBits();
  // Smallest possible shift amount, clamped to UINT_MAX when it does not fit
  // into 32 bits (which keeps the comparisons below safe).
  unsigned MinShiftValue =
      ShiftRange.getUnsignedMin().getActiveBits() <= 32
          ? static_cast<unsigned>(ShiftRange.getUnsignedMin().getZExtValue())
          : std::numeric_limits<unsigned>::max();

  // First we deal with the cases where the result is guaranteed to vanish or
  // be equal to poison.

  auto replaceWith = [&](Value *V) -> void {
    LShr->replaceAllUsesWith(V);
    LShr->eraseFromParent();
    ++NumLShrsRemoved;
  };

  // If the shift is larger than or equal to the bit width of the argument,
  // the instruction returns a poison value.
  if (MinShiftValue >= OrigWidth) {
    replaceWith(PoisonValue::get(RetTy));
    return true;
  }

  // If we are guaranteed to shift away all bits,
  // we replace the shift by the null value.
  // We should not apply the optimization if LShr is exact,
  // as the result may be poison.
  if (!LShr->isExact() && MinShiftValue >= MaxActiveBitsInArg) {
    replaceWith(Constant::getNullValue(RetTy));
    return true;
  }

  // Since LShr returns poison if the shift is larger than or equal to the bit
  // width of the argument, we must make sure that the maximal possible value
  // for the shift is smaller than the new width after narrowing. Otherwise
  // some shifts that originally vanish would result in poison after the
  // narrowing.
  unsigned MaxShiftValue =
      ShiftRange.getUnsignedMax().getActiveBits() <= 32
          ? static_cast<unsigned>(ShiftRange.getUnsignedMax().getZExtValue())
          : std::numeric_limits<unsigned>::max();
  if (OrigWidth <= MaxShiftValue)
    return false;

  // That's how many bits we need to hold both operands.
  unsigned MaxActiveBits =
      std::max(MaxActiveBitsInArg, ShiftRange.getActiveBits());

  // We could do better, but is it worth it?
  // With the first argument being the n-bit integer, we may limit the value of
  // the second argument to be less than n, as larger shifts would lead to a
  // vanishing result or poison. Thus the number of bits in the second argument
  // is limited by Log2(n). Unfortunately, this would require an introduction of
  // a select instruction (or llvm.min) to make sure every argument larger than
  // n is mapped to n and not just truncated. We do not implement it here.

  // What is the smallest bit width that can accommodate the entire value ranges
  // of both of the operands? Don't shrink below 8 bits wide.
  unsigned NewWidth = getNarrowedWidth(MaxActiveBits);

  // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
  // two.
  if (NewWidth >= OrigWidth)
    return false;

  // This is the time to check if all the users are TruncInst
  // and to figure out what the largest user is: never narrow
  // below the widest trunc that consumes this lshr.
  for (User *user : LShr->users()) {
    if (TruncInst *TI = dyn_cast<TruncInst>(user)) {
      NewWidth = std::max(NewWidth, TI->getDestTy()->getScalarSizeInBits());
    } else {
      return false;
    }
  }

  // See comment above MaxShiftValue: shifts that fit the original width but
  // not the narrowed one would become poison after narrowing.
  if (NewWidth <= MaxShiftValue)
    return false;

  // We are ready to truncate.
  IRBuilder<> B(LShr);
  Type *TruncTy = RetTy->getWithNewBitWidth(NewWidth);
  Value *ArgTrunc = B.CreateTruncOrBitCast(LShr->getOperand(0), TruncTy,
                                           LShr->getName() + ".arg.trunc");
  Value *ShiftTrunc = B.CreateTruncOrBitCast(LShr->getOperand(1), TruncTy,
                                             LShr->getName() + ".shift.trunc");
  Value *LShrTrunc =
      B.CreateBinOp(LShr->getOpcode(), ArgTrunc, ShiftTrunc, LShr->getName());
  Value *Zext = B.CreateZExt(LShrTrunc, RetTy, LShr->getName() + ".zext");

  // CreateBinOp may constant-fold, so the result is not guaranteed to be a
  // BinaryOperator; only propagate debug-loc/exact when it is one.
  if (BinaryOperator *LShrTruncBO = dyn_cast<BinaryOperator>(LShrTrunc)) {
    LShrTruncBO->setDebugLoc(LShr->getDebugLoc());
    LShrTruncBO->setIsExact(LShr->isExact());
  }
  LShr->replaceAllUsesWith(Zext);
  LShr->eraseFromParent();

  ++NumLShrsNarrowed;
  return true;
}

/// Dispatch an lshr to the narrowing transform, honoring the
/// -correlated-propagation-narrow-lshr command-line gate.
static bool processLShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
  if (!NarrowLShr)
    return false;
  return narrowLShr(SDI, LVI);
}

static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
ConstantRange LRange =
LVI->getConstantRangeAtUse(SDI->getOperandUse(0), /*UndefAllowed*/ false);
Expand All @@ -1093,6 +1239,11 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
SDI->replaceAllUsesWith(BO);
SDI->eraseFromParent();

// Check if the new LShr can be narrowed.
if (NarrowLShr) {
narrowLShr(BO, LVI);
}

return true;
}

Expand Down Expand Up @@ -1254,6 +1405,9 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
case Instruction::AShr:
BBChanged |= processAShr(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::LShr:
BBChanged |= processLShr(cast<BinaryOperator>(&II), LVI);
break;
case Instruction::SExt:
BBChanged |= processSExt(cast<SExtInst>(&II), LVI);
break;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; RUN: opt < %s -passes="correlated-propagation,instcombine" -S | FileCheck %s

; The tests below are the same as in lshr.ll
; Here we test whether the CorrelatedValuePropagation pass
; composed with InstCombinePass produces the expected optimizations.

; CHECK-LABEL: @trunc_test1
; CHECK-NEXT: [[A1:%.*]] = lshr i32 [[A:%.*]], 16
; CHECK-NEXT: [[CARG:%.*]] = trunc nuw i32 [[A1]] to i16
; CHECK-NEXT: [[B1:%.*]] = trunc i32 [[B:%.*]] to i16
; CHECK-NEXT: [[CSHIFT:%.*]] = and i16 [[B1]], 15
; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[CARG]], [[CSHIFT]]
; CHECK-NEXT: ret i16 [[C1]]

define i16 @trunc_test1(i32 %a, i32 %b) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you demonstrate that this patch will improve the codegen? https://godbolt.org/z/hbP9qrsxv

BTW this transformation is incorrect: https://alive2.llvm.org/ce/z/j2w5Wa

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. The original motivation is a subtle difference between the division and right shift. Due to the lack of narrowing for LShr, UDivs may produce a better code, here's an example:

https://godbolt.org/z/7EPnj9a8d

With the proposed patch the two functions generate identical (better) code.

  2. The incorrect transformation was fixed by checking that the maximal possible shift amount is smaller than the new bit width.

%a.eff.trunc = lshr i32 %a, 16
%b.eff.trunc = and i32 %b, 15
%c = lshr i32 %a.eff.trunc, %b.eff.trunc
%c.trunc = trunc i32 %c to i16
ret i16 %c.trunc
}

; CHECK-LABEL: @trunc_test2
; CHECK-NEXT: [[C1:%.*]] = lshr i16 [[A:%.*]], 2
; CHECK-NEXT: ret i16 [[C1]]

define i16 @trunc_test2(i16 %a) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It has been handled by InstCombine (SimplifyDemandedBits?): https://godbolt.org/z/143fs31s4

%a.ext = zext i16 %a to i32
%c = lshr i32 %a.ext, 2
%c.trunc = trunc i32 %c to i16
ret i16 %c.trunc
}

; CHECK-LABEL: @trunc_test3
; CHECK-NEXT: [[B:%.*]] = lshr i16 [[A:%.*]], 2
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i16 [[B]], 123
; CHECK-NEXT: ret i16 [[C]]

define i16 @trunc_test3(i16 %a) {
  ; The lshr's only user is an add, not a trunc, so CVP's lshr narrowing does
  ; not fire; the expected all-i16 form comes from the combined
  ; CVP+InstCombine pipeline (see RUN line).
  %a.ext = zext i16 %a to i32
  %b = lshr i32 %a.ext, 2
  %c = add i32 %b, 123
  %c.trunc = trunc i32 %c to i16
  ret i16 %c.trunc
}

; CHECK-LABEL: @trunc_test4
; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
; CHECK-NEXT: [[B:%.*]] = trunc nuw nsw i32 [[A1]] to i16
; CHECK-NEXT: [[B1:%.*]] = lshr i16 [[B]], 2
; CHECK-NEXT: ret i16 [[B1]]

define i16 @trunc_test4(i32 %a) {
  ; The udiv by 17000000 bounds %a.eff.trunc to a small range, so the lshr's
  ; value fits in a narrow type; both users are truncs and the widest is i16,
  ; so CVP narrows the lshr to i16.
  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
  %b = lshr i32 %a.eff.trunc, 2
  %b.trunc.1 = trunc i32 %b to i16
  %b.trunc.2 = trunc i32 %b to i8
  ret i16 %b.trunc.1
}

; CHECK-LABEL: @trunc_test5
; CHECK-NEXT: [[A1:%.*]] = udiv i32 [[A:%.*]], 17000000
; CHECK-NEXT: [[B:%.*]] = lshr i32 [[A1]], 2
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i32 [[B]], 123
; CHECK-NEXT: ret i32 [[C]]

define i32 @trunc_test5(i32 %a) {
  ; Same range setup as trunc_test4, but %b also has a non-trunc user (%c),
  ; so CVP must not narrow the lshr: it stays i32.
  %a.eff.trunc = udiv i32 %a, 17000000 ; larger than 2^24
  %b = lshr i32 %a.eff.trunc, 2
  %b.trunc.1 = trunc i32 %b to i16
  %b.trunc.2 = trunc i32 %b to i8
  %c = add i32 %b, 123
  ret i32 %c
}

; CHECK-LABEL: @zero_test1
; CHECK-NEXT: ret i32 poison

define i32 @zero_test1(i32 %a) {
  ; Shift amount (32) equals the bit width, so the lshr result is poison and
  ; the whole function folds to `ret i32 poison`.
  %b = lshr i32 %a, 32
  %c = add i32 %b, 123
  ret i32 %c
}

; CHECK-LABEL: @zero_test2
; CHECK-NEXT: ret i32 poison

define i32 @zero_test2(i32 %a, i32 %b) {
  ; The nuw add makes the shift amount at least 50, i.e. >= the 32-bit width,
  ; so the lshr result is poison.
  %b.large = add nuw nsw i32 %b, 50
  %c = lshr i32 %a, %b.large
  %d = add i32 %c, 123
  ret i32 %d
}

; CHECK-LABEL: @zero_test3
; CHECK-NEXT: ret i32 123

define i32 @zero_test3(i32 %a, i32 %b) {
  ; %a.small has at most 16 active bits and the shift amount is at least 20,
  ; so every set bit is shifted out: the lshr folds to 0 and the add to 123.
  %a.small = lshr i32 %a, 16
  %b.large = add nuw nsw i32 %b, 20
  %c = lshr i32 %a.small, %b.large
  %d = add i32 %c, 123
  ret i32 %d
}

; CHECK-LABEL: @zero_test4
; CHECK-NEXT: ret i32 123

define i32 @zero_test4(i32 %a, i32 %b) {
  ; Like zero_test3 but with an `exact` lshr: CVP skips the replace-by-zero
  ; (an exact shift that drops set bits would be poison); the final fold to
  ; 123 presumably comes from InstCombine — see RUN line.
  %a.small = lshr i32 %a, 16
  %b.large = add nuw nsw i32 %b, 20
  %c = lshr exact i32 %a.small, %b.large
  %d = add i32 %c, 123
  ret i32 %d
}
Loading
Loading