Skip to content
80 changes: 80 additions & 0 deletions llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,8 @@ class StructurizeCFG {

RegionNode *PrevNode;

void reorderIfElseBlock(BasicBlock *BB, unsigned Idx);

void orderNodes();

void analyzeLoops(RegionNode *N);
Expand Down Expand Up @@ -409,6 +411,31 @@ class StructurizeCFGLegacyPass : public RegionPass {

} // end anonymous namespace

/// Helper function for heuristics to order if else block
/// Checks whether an instruction is potential vector copy instruction, if so,
/// checks if the operands are from different BB. if so, returns True.
// Then there's a possibility of coelescing without interference when ordered
// first.
static bool hasAffectingInstructions(Instruction *I, BasicBlock *BB) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

if (!I || I->getParent() != BB)
return true;

// If the instruction is not a poterntial copy instructoin, return true.
if (!isa<ExtractElementInst>(*I) && !isa<ExtractValueInst>(*I))
return false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't understand looking for special case instruction types

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The best way these heuristics will work is if we know which are going to be copy instructions after lowering. Here I just look at these two because mostly these are getting lowered to copy instruction. Another way of calculating heuristics for ordering was to see whether the incoming phi value instruction defined by the operands of the same block or different block. but this is too broad and can have unintentional reorders.


// Check if any operands are instructions defined in the same block.
for (unsigned i = 0, e = I->getNumOperands(); i < e; ++i) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use range loop

if (auto *OpI = dyn_cast<Instruction>(I->getOperand(i))) {
if (OpI->getParent() == BB)
return false;
}
}

return true;
}

char StructurizeCFGLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
Expand All @@ -419,6 +446,58 @@ INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)

/// Then and Else block order in SCC is arbitrary. But based on the
/// order, after structurization there are cases where there might be extra
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just sort blocks in topological order or something? I'm not sure I understand this heuristic

Copy link
Contributor Author

@VigneshwarJ VigneshwarJ May 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The blocks are already sorted in the topological order. But in case of if-then-else, topologically then,else or else,then is same. But based on the order, there are extra copies that leads to spill in large cases.

This is specifically seen in case where one block just copies values and other block modifies values.
godbolt link

if:
  %vec = <4 x i32> ...
  br i1 %c %then %else
then:
  %x = extractelement <4 x i32> %vec, i32 0
  %z = add i32 %x, 1
  br label %merge
else:
  %a = extractelement <4 x i32> %vec, i32 0
  br label %merge
merge:
  %phi = phi i32 [ %z, %then ], [ %a, %else ]
  store i32 %phi, ptr  %ptr
  ret void

After structurization and machine code , if 'then' block is ordered first
then at register coalescer

if:
   [v0 - v4] = ...
    cond_branch_scc then
tmp:
    v5 = 0
    branch Flow
then:
   v5 = copy v0  ; eliminated in reg coalescer
   v5 = v5 + 1  ; v5 = v0 + 1
flow:
   v6 = v5 ; eliminated
   cond_branch final
else:
   v6 = copy v0 ;  v5 = copy v0 
 ; this copy v5 = copy v0 can't be coalesced further then->flow->else live range 
final:
   store v6 ; store v5

but at the same time, if 'else' block is ordered first,

if:
   [v0 - v4] = ...
    cond_branch_scc else
tmp:
    v5 = 0
    branch Flow
else:
   v5 = copy v0 ; eliminated
flow:
   v6 = copy v5 ; eliminated
   cond_branch final
then:
   v6 = copy v0  ; eliminated 
   v6 = v6 + 1  ; v6 = v0 + 1 ; can be coalesced to v0 = v0 + 1
final:
   store v6 ; coalesced to  store v0

Theres an extra copy just due to ordering in the first case. Though then and else block won't be live at same time due to structurization, we see this.

This heuristics just sees instructions that can be lowered to copy instructions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anything can be lowered to copied, initially most cross block values emit copies. Does this really mean "0" cost instructions?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, changed the code to check for zero cost instructions

/// VGPR copies due to interference during register coelescing.
/// eg:- incoming phi values from Else block contains only vgpr copies and
/// incoming phis in Then block has are some modification for the vgprs.
/// after structurization, there would be interference when coelesing when Then
/// block is ordered first. But those copies can be coelesced when Else is
/// ordered first.
///
/// This function checks the incoming phi values in the merge block and
/// orders based on the following heuristics of Then and Else block. Checks
/// whether an incoming phi can be potential copy instructions and if so
/// checks whether copy within the block or not.
/// Increases score if its a potential copy from outside the block.
/// the higher scored block is ordered first.
void StructurizeCFG::reorderIfElseBlock(BasicBlock *BB, unsigned Idx) {
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());

if (Term && Term->isConditional()) {
BasicBlock *ThenBB = Term->getSuccessor(0);
BasicBlock *ElseBB = Term->getSuccessor(1);
BasicBlock *ThenSucc = ThenBB->getSingleSuccessor();

if (BB == ThenBB->getSinglePredecessor() &&
(ThenBB->getSinglePredecessor() == ElseBB->getSinglePredecessor()) &&
(ThenSucc && ThenSucc == ElseBB->getSingleSuccessor())) {
unsigned ThenScore = 0, ElseScore = 0;

for (PHINode &Phi : ThenSucc->phis()) {
Value *ThenVal = Phi.getIncomingValueForBlock(ThenBB);
Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);

if (auto *Inst = dyn_cast<Instruction>(ThenVal))
ThenScore += hasAffectingInstructions(Inst, ThenBB);
if (auto *Inst = dyn_cast<Instruction>(ElseVal))
ElseScore += hasAffectingInstructions(Inst, ElseBB);
}

if (ThenScore != ElseScore) {
if (ThenScore < ElseScore)
std::swap(ThenBB, ElseBB);

// reorder the last two inserted elements in Order
if (Idx >= 2 && Order[Idx - 1]->getEntry() == ElseBB &&
Order[Idx - 2]->getEntry() == ThenBB) {
std::swap(Order[Idx - 1], Order[Idx - 2]);
}
}
}
}
}

/// Build up the general order of nodes, by performing a topological sort of the
/// parent region's nodes, while ensuring that there is no outer cycle node
/// between any two inner cycle nodes.
Expand Down Expand Up @@ -452,6 +531,7 @@ void StructurizeCFG::orderNodes() {
// Add the SCC nodes to the Order array.
for (const auto &N : SCC) {
assert(I < E && "SCC size mismatch!");
reorderIfElseBlock(N.first->getEntry(), I);
Order[I++] = N.first;
}
}
Expand Down
129 changes: 129 additions & 0 deletions llvm/test/Transforms/StructurizeCFG/order-if-else.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s

define amdgpu_kernel void @test_extractelement_1(<4 x i32> %vec, i1 %cond, ptr %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_1(
; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true
; CHECK-NEXT: br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]]
; CHECK: [[FLOW]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[A:%.*]], %[[ELSE]] ], [ poison, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ]
; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
; CHECK-NEXT: [[Z:%.*]] = add i32 [[X]], 1
; CHECK-NEXT: br label %[[MERGE]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[A]] = extractelement <4 x i32> [[VEC]], i32 1
; CHECK-NEXT: br label %[[FLOW]]
; CHECK: [[MERGE]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[Z]], %[[THEN]] ]
; CHECK-NEXT: store i32 [[PHI]], ptr [[PTR]], align 4
; CHECK-NEXT: ret void
;
entry:
br i1 %cond, label %then, label %else

then:
%x = extractelement <4 x i32> %vec, i32 0
%z = add i32 %x, 1
br label %merge

else:
%a = extractelement <4 x i32> %vec, i32 1
br label %merge

merge:
%phi = phi i32 [ %z, %then ], [ %a, %else ]
store i32 %phi, ptr %ptr
ret void
}

define amdgpu_kernel void @test_extractelement_2(<4 x i32> %vec, i1 %cond, ptr %ptr) {
; CHECK-LABEL: define amdgpu_kernel void @test_extractelement_2(
; CHECK-SAME: <4 x i32> [[VEC:%.*]], i1 [[COND:%.*]], ptr [[PTR:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[COND_INV:%.*]] = xor i1 [[COND]], true
; CHECK-NEXT: br i1 [[COND_INV]], label %[[ELSE:.*]], label %[[FLOW:.*]]
; CHECK: [[FLOW]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[A:%.*]], %[[ELSE]] ], [ poison, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[ELSE]] ], [ true, %[[ENTRY]] ]
; CHECK-NEXT: br i1 [[TMP1]], label %[[THEN:.*]], label %[[MERGE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
; CHECK-NEXT: [[Y:%.*]] = add i32 [[X]], 1
; CHECK-NEXT: [[VEC1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i32 0
; CHECK-NEXT: [[Z:%.*]] = extractelement <4 x i32> [[VEC1]], i32 0
; CHECK-NEXT: br label %[[MERGE]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[A]] = extractelement <4 x i32> [[VEC]], i32 1
; CHECK-NEXT: br label %[[FLOW]]
; CHECK: [[MERGE]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[Z]], %[[THEN]] ]
; CHECK-NEXT: store i32 [[PHI]], ptr [[PTR]], align 4
; CHECK-NEXT: ret void
;
entry:
br i1 %cond, label %then, label %else

then:
%x = extractelement <4 x i32> %vec, i32 1
%y = add i32 %x, 1
%vec1 = insertelement <4 x i32> poison, i32 %y, i32 0
%z = extractelement <4 x i32> %vec1, i32 0
br label %merge

else:
%a = extractelement <4 x i32> %vec, i32 1
br label %merge

merge:
%phi = phi i32 [ %z, %then ], [ %a, %else ]
store i32 %phi, ptr %ptr
ret void
}

%pair = type { i32, i32 }
define amdgpu_kernel void @test_extractvalue(ptr %ptr, i1 %cond) {
; CHECK-LABEL: define amdgpu_kernel void @test_extractvalue(
; CHECK-SAME: ptr [[PTR:%.*]], i1 [[COND:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[LOAD_THEN:%.*]] = load [[PAIR:%.*]], ptr [[PTR]], align 4
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[FLOW:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[A_THEN:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
; CHECK-NEXT: br label %[[FLOW]]
; CHECK: [[FLOW]]:
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[A_THEN]], %[[THEN]] ], [ poison, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[THEN]] ], [ true, %[[ENTRY]] ]
; CHECK-NEXT: br i1 [[TMP1]], label %[[ELSE:.*]], label %[[MERGE:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: [[A_ELSE:%.*]] = extractvalue [[PAIR]] [[LOAD_THEN]], 0
; CHECK-NEXT: [[SUM_ELSE:%.*]] = add i32 [[A_ELSE]], 1
; CHECK-NEXT: br label %[[MERGE]]
; CHECK: [[MERGE]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP0]], %[[FLOW]] ], [ [[SUM_ELSE]], %[[ELSE]] ]
; CHECK-NEXT: store i32 [[PHI]], ptr [[PTR]], align 4
; CHECK-NEXT: ret void
;
entry:
%load_then = load %pair, ptr %ptr
br i1 %cond, label %then, label %else

then:
%a_then = extractvalue %pair %load_then, 0
br label %merge

else:
%a_else = extractvalue %pair %load_then, 0
%sum_else = add i32 %a_else, 1
br label %merge

merge:
%phi = phi i32 [ %a_then, %then ], [ %sum_else, %else ]
store i32 %phi, ptr %ptr
ret void
}
Loading