Skip to content
Merged
42 changes: 36 additions & 6 deletions llvm/lib/Transforms/Utils/SimplifyCFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1663,18 +1663,29 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
static void hoistConditionalLoadsStores(
BranchInst *BI,
SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
bool Invert) {
std::optional<bool> Invert) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment like

\param Invert  ...

?

It's a little hard to know when it's nullopt w/o searching for the caller.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

auto &Context = BI->getParent()->getContext();
auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
auto *Cond = BI->getOperand(0);
// Construct the condition if needed.
BasicBlock *BB = BI->getParent();
IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back());
Value *Mask = Builder.CreateBitCast(
Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
IRBuilder<> Builder(Invert ? SpeculatedConditionalLoadsStores.back() : BI);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems we don't need to select the insertion point by condition. Always use BI? Same for line 1686

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, we cannot. Because in the triangle case the SpeculatedConditionalLoadsStores were pushed in reverse order, we must pick the insertion point one by one for each of them; otherwise, we will get something like

define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) #0 {
entry:
  %0 = bitcast i1 %cond to <1 x i1>
  %1 = bitcast i64 %5 to <1 x i64>
  call void @llvm.masked.store.v1i64.p0(<1 x i64> %1, ptr %q, i32 8, <1 x i1> %0)
  %2 = bitcast i32 %7 to <1 x i32>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %2, ptr %p, i32 4, <1 x i1> %0)
  %3 = bitcast i16 %9 to <1 x i16>
  call void @llvm.masked.store.v1i16.p0(<1 x i16> %3, ptr %b, i32 2, <1 x i1> %0)
  %4 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %b, i32 8, <1 x i1> %0, <1 x i64> poison)
  %5 = bitcast <1 x i64> %4 to i64
  %6 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %0, <1 x i32> poison)
  %7 = bitcast <1 x i32> %6 to i32
  %8 = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %p, i32 2, <1 x i1> %0, <1 x i16> poison)
  %9 = bitcast <1 x i16> %8 to i16
  ret void
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given its an optional<bool> - would it be easier to grok if you used Invert.has_value() ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Value *Mask = nullptr;
Value *Mask0 = nullptr;
Value *Mask1 = nullptr;
if (Invert) {
Mask = Builder.CreateBitCast(
*Invert ? Builder.CreateXor(Cond, ConstantInt::getTrue(Context)) : Cond,
VCondTy);
} else {
Mask0 = Builder.CreateBitCast(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering we name the BB as TrueBB and FalseBB in the comment, maybe
Mask0 -> MaskFalse
Mask1 -> MaskTrue

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
Mask1 = Builder.CreateBitCast(Cond, VCondTy);
}
for (auto *I : SpeculatedConditionalLoadsStores) {
IRBuilder<> Builder(I);
IRBuilder<> Builder(Invert ? I : BI);
if (!Invert)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if (!Mask) looks better?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, we cannot. The Mask is not a nullptr in the second iteration.

Mask = I->getParent() == BI->getSuccessor(0) ? Mask1 : Mask0;
// We currently assume conditional faulting load/store is supported for
// scalar types only when creating new instructions. This can be easily
// extended for vector types in the future.
Expand Down Expand Up @@ -1771,6 +1782,25 @@ bool SimplifyCFGOpt::hoistCommonCodeFromSuccessors(Instruction *TI,
if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
return false;

auto *BI = dyn_cast<BranchInst>(TI);
if (BI && HoistLoadsStoresWithCondFaulting &&
Options.HoistLoadsStoresWithCondFaulting) {
SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
for (auto *Succ : successors(BB)) {
for (Instruction &I : drop_end(*Succ)) {
if (!isSafeCheapLoadStore(&I, TTI) ||
SpeculatedConditionalLoadsStores.size() ==
HoistLoadsStoresWithCondFaultingThreshold)
return false;
SpeculatedConditionalLoadsStores.push_back(&I);
}
}

if (!SpeculatedConditionalLoadsStores.empty())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks suspicious here since the following checks are skipped. We should add tests to check what happen if there is non-load/store instructions in the successors.

Copy link
Contributor Author

@phoebewang phoebewang Sep 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The assumption here is that prior passes have already moved common instructions out of the branches. It works in a pipeline, e.g.,

$ cat single_predecessor.ll
define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) {
entry:
  %tobool = icmp ne i32 %x, 0
  br i1 %tobool, label %if.end, label %if.then
if.end:
  store i32 1, ptr %q
  %c = add i32 %a, %b ; <== common instruction
  ret i32 %c
if.then:
  %0 = load i32, ptr %q
  store i32 %0, ptr %p
  %d = add i32 %a, %b ; <== common instruction
  ret i32 %d
}

$ opt -passes=simplifycfg,'instcombine<no-verify-fixpoint>','simplifycfg<hoist-loads-stores-with-cond-faulting>' -mtriple=x86_64 -mattr=+cf single_predecessor.ll -S -o -
; ModuleID = 'single_predecessor.ll'
source_filename = "single_predecessor.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64"

define i32 @single_predecessor(ptr %p, ptr %q, i32 %x, i32 %a, i32 %b) #0 {
entry:
  %tobool.not = icmp eq i32 %x, 0
  %0 = xor i1 %tobool.not, true
  %1 = bitcast i1 %0 to <1 x i1>
  %2 = bitcast i1 %tobool.not to <1 x i1>
  %3 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %q, i32 4, <1 x i1> %2, <1 x i32> poison)
  %4 = bitcast <1 x i32> %3 to i32
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %3, ptr %p, i32 4, <1 x i1> %2)
  call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %1)
  %common.ret.op = add i32 %a, %b
  ret i32 %common.ret.op
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Considering current implementation separates the common instr hoist from load/store hoist,
probably we should move the code to

  if (BI->getSuccessor(0)->getSinglePredecessor()) {
    if (BI->getSuccessor(1)->getSinglePredecessor()) {
      if (HoistCommon && hoistCommonCodeFromSuccessors(
                             BI->getParent(), !Options.HoistCommonInsts))
        return requestResimplify();

     if  (HoistLoadsStoresWithCondFaulting &&
          Options.HoistLoadsStoresWithCondFaulting && hoistConditionalLoadsStores(...)) 
          return requestResimplify();

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's necessary, given that the last simplifycfg run is supposed to clean up single-entry-single-exit or empty blocks.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Passes/PassBuilderPipelines.cpp#L1533-L1534

If we have such patterns, they should already have been optimized by the previous simplifycfg runs:

$ clang -S apx.c -mapxf -mllvm -print-pipeline-passes -O1 | sed -e 's/,/\n/g' | sed -e 's/;no-[^;>]*//g' | grep ^simplifycfg
simplifycfg<bonus-inst-threshold=1;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>)
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch>
simplifycfg<bonus-inst-threshold=1;switch-range-to-icmp;keep-loops;hoist-loads-stores-with-cond-faulting;speculate-blocks;simplify-cond-branch;speculate-unpredictables>)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't mean that we should clean up the empty BB ourselves. I meant that the logic of hoisting loads/stores does not match the name and comment of hoistCommonCodeFromSuccessors. Probably we should move it out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, done.

hoistConditionalLoadsStores(BI, SpeculatedConditionalLoadsStores,
std::nullopt);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we return here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, good catch!

}

// The second of pair is a SkipFlags bitmask.
using SuccIterPair = std::pair<BasicBlock::iterator, unsigned>;
SmallVector<SuccIterPair, 8> SuccIterPairs;
Expand Down
46 changes: 36 additions & 10 deletions llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
Original file line number Diff line number Diff line change
Expand Up @@ -276,21 +276,19 @@ if.false: ; preds = %if.true, %entry
}

;; Both of successor 0 and successor 1 have a single predecessor.
;; TODO: Support transform for this case.
define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
; CHECK-LABEL: @single_predecessor(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
; CHECK: common.ret:
; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can use TMP3 here. Two bitcasts looks redundant.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

; CHECK-NEXT: ret void
; CHECK: if.end:
; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
; CHECK-NEXT: br label [[COMMON_RET:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
; CHECK-NEXT: br label [[COMMON_RET]]
;
entry:
%tobool = icmp ne i32 %a, 0
Expand Down Expand Up @@ -728,6 +726,34 @@ if.true:
ret i32 %res
}

define void @diamondCFG(i32 %a, ptr %c, ptr %d) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably remove this? This function tests the same thing as single_predecessor

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

; CHECK-LABEL: @diamondCFG(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[A:%.*]], 0
; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TOBOOL_NOT]], true
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TOBOOL_NOT]] to <1 x i1>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> zeroinitializer, ptr [[D:%.*]], i32 4, <1 x i1> [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[A]] to <1 x i32>
; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[C:%.*]], i32 4, <1 x i1> [[TMP1]])
; CHECK-NEXT: ret void
;
entry:
%tobool.not = icmp eq i32 %a, 0
br i1 %tobool.not, label %if.else, label %if.then

if.then: ; preds = %entry
store i32 %a, ptr %c, align 4
br label %if.end

if.else: ; preds = %entry
store i32 0, ptr %d, align 4
br label %if.end

if.end: ; preds = %if.else, %if.then
ret void
}

declare i32 @read_memory_only() readonly nounwind willreturn speculatable

!llvm.dbg.cu = !{!0}
Expand Down