Skip to content

Commit 0f08c51

Browse files
vladimirradosavljevicakiramenai
authored andcommitted
[EraVM] Fold arithmetic and bitwise instructions to select
This patch expands support to fold arithmetic and bitwise instructions to select, similar to the existing support for folding add to select. It also refactors how the new instruction is generated: instead of building it from scratch, we clone the original instruction and change some of its operands. Signed-off-by: Vladimir Radosavljevic <[email protected]>
1 parent 9877225 commit 0f08c51

14 files changed

+285
-302
lines changed

llvm/lib/Target/EraVM/EraVMOptimizeSelectPreRA.cpp

Lines changed: 126 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
//
99
// This file implements a pre-ra pass to optimize SELECT instructions. It tries
1010
// to fold SELECT with its user in case one of the input operands of SELECT
11-
// is immediate zero and it tries to fold ADD with SELECT if one of the inputs
12-
// of ADD matches with the input of SELECT.
11+
// is immediate zero and it tries to fold arithmetic and bitwise instructions
12+
// with SELECT if one of the inputs of these instructions matches with the input
13+
// of SELECT.
1314
//
1415
//===----------------------------------------------------------------------===//
1516

@@ -82,6 +83,34 @@ class EraVMOptimizeSelectPreRA : public MachineFunctionPass {
8283
Register In1, unsigned NewOpcode, unsigned NewCC,
8384
Register TieReg) const;
8485

86+
/// Fold MI into SelectMI and return a folded instruction. To do this,
87+
/// we clone MI, place the new instruction just before the select,
88+
/// change its output with the output register of the SelectMI
89+
/// instruction, update conditional code with NewCC, tie the TieReg
90+
/// with the output register and add implicit flags because we are
91+
/// updating conditional code.
92+
///
93+
/// Example:
94+
///
95+
/// folding
96+
/// %2:gr256 = ADDrrr_s %0:gr256, %1:gr256, i256 0
97+
/// %4:gr256 = SELrrr %2:gr256, %0:gr256, i256 2, implicit $flags
98+
/// into
99+
/// %4:gr256 = ADDrrr_s %0:gr256, %1:gr256, 2,
100+
/// implicit %0:gr256(tied-def 0), implicit $flags
101+
///
102+
MachineInstr *foldToSelect(MachineInstr &MI, MachineInstr &SelectMI,
103+
unsigned NewCC, Register TieReg) const;
104+
105+
/// Return an out register of a folding-candidate MI. In case MI has two
106+
/// outputs, we can only fold instruction iff one output is used and the
107+
/// other is not.
108+
/// In case both outputs are used and one of them is used by a SELECT,
109+
/// folding will make this instruction execute conditionally, meaning
110+
/// that other output will be calculated iff the condition is met, thus
111+
/// the folded instruction will not always produce the expected results.
112+
Register getOutRegToFold(const MachineInstr &MI) const;
113+
85114
/// In case one of the input operand of SELECT is immediate zero, then it is
86115
/// possible to fold it with its sole user to result in just one folded
87116
/// instruction.
@@ -98,9 +127,10 @@ class EraVMOptimizeSelectPreRA : public MachineFunctionPass {
98127
/// if z and y can be allocated to same reg. The tie is used to ensure this.
99128
bool tryFoldSelectZero(MachineBasicBlock &MBB);
100129

101-
/// In case ADD is fed into a SELECT, and one of the inputs of ADD matches
102-
/// with the input of SELECT, then it is possible to fold ADD with SELECT
103-
/// to result in just one folded instruction.
130+
/// In case an arithmetic or a bitwise instruction is fed into a SELECT, and
131+
/// one of the inputs of these instructions matches with the input of SELECT,
132+
/// then it is possible to fold it with SELECT to result in just one folded
133+
/// instruction.
104134
///
105135
/// A typical case is like below:
106136
///
@@ -113,7 +143,7 @@ class EraVMOptimizeSelectPreRA : public MachineFunctionPass {
113143
///
114144
/// if outSEL and x/y can be allocated to same reg.
115145
/// The tie is used to ensure this.
116-
bool tryFoldAddToSelect(MachineBasicBlock &MBB);
146+
bool tryFoldToSelect(MachineBasicBlock &MBB);
117147
};
118148

119149
char EraVMOptimizeSelectPreRA::ID = 0;
@@ -208,6 +238,67 @@ MachineInstr *EraVMOptimizeSelectPreRA::getFoldedInst(
208238
return NewMI;
209239
}
210240

241+
/// Fold \p MI into \p SelectMI and return the newly created instruction.
///
/// A clone of \p MI is inserted immediately before \p SelectMI. On the clone,
/// the def chosen by getOutRegToFold() is rewritten to the register of
/// operand 0 of \p SelectMI, the condition-code operand is changed to the
/// immediate \p NewCC, and two implicit register operands are appended:
/// \p TieReg followed by EraVM::Flags. The appended \p TieReg operand is then
/// tied to the rewritten def.
///
/// Neither \p MI nor \p SelectMI is erased here.
MachineInstr *EraVMOptimizeSelectPreRA::foldToSelect(MachineInstr &MI,
                                                     MachineInstr &SelectMI,
                                                     unsigned NewCC,
                                                     Register TieReg) const {
  // Duplicate MI and put the copy directly in front of the SELECT.
  MachineFunction &Func = *MI.getMF();
  MachineInstr *Folded = Func.CloneMachineInstr(&MI);
  SelectMI.getParent()->insert(SelectMI, Folded);

  // Determine which output of the copy is the one to be replaced.
  const Register RegToReplace = getOutRegToFold(*Folded);
  assert(RegToReplace != EraVM::NoRegister && "Unexpected output register.");

  // Locate the def operand that carries that register.
  auto *DefOp = llvm::find_if(Folded->defs(),
                              [RegToReplace](MachineOperand &Op) {
                                return Op.getReg() == RegToReplace;
                              });
  assert(DefOp != Folded->defs().end() &&
         "Didn't find the def register to update.");

  // Remember the operand index of the def; it is the one that gets tied.
  const unsigned TiedDefIdx = DefOp->getOperandNo();

  // The copy now defines the register that the SELECT used to define.
  DefOp->setReg(SelectMI.getOperand(0).getReg());

  // Switch the condition code and append the implicit operands: first the
  // register to tie, then the flags register.
  EraVM::ccIterator(*Folded)->ChangeToImmediate(NewCC);
  const MachineInstrBuilder Builder(Func, Folded);
  Builder.addReg(TieReg, RegState::Implicit);
  Builder.addReg(EraVM::Flags, RegState::Implicit);

  // TieReg is the second-to-last operand (flags is last). Tying it to the
  // def ensures both get the same register after the RA pass, which is the
  // key to making the transformation in this pass correct.
  const unsigned TiedUseIdx = Folded->getNumOperands() - 2;
  Folded->tieOperands(TiedDefIdx, TiedUseIdx);
  return Folded;
}
280+
281+
/// Return the output register of \p MI that is a candidate for folding.
///
/// \p MI is expected to have one or two explicit defs (asserted). With a
/// single def, that def's register is returned. With two defs:
///  - if both outputs have non-debug uses, EraVM::NoRegister is returned;
///  - if the first output has no non-debug uses, the second is returned;
///  - otherwise the first output is returned.
Register
EraVMOptimizeSelectPreRA::getOutRegToFold(const MachineInstr &MI) const {
  const unsigned NumDefs = MI.getNumExplicitDefs();
  assert((NumDefs == 1 || NumDefs == 2) &&
         "Unexpected number of register outputs");

  const Register Out0 = EraVM::out0Iterator(MI)->getReg();

  // Single register output: nothing else to consider.
  if (NumDefs != 2)
    return Out0;

  const Register Out1 = EraVM::out1Iterator(MI)->getReg();
  const bool Out0Unused = RegInfo->use_nodbg_empty(Out0);
  const bool Out1Unused = RegInfo->use_nodbg_empty(Out1);

  // In case both outputs are used, bail out.
  if (!Out0Unused && !Out1Unused)
    return EraVM::NoRegister;

  // Return the output that should have the use.
  return Out0Unused ? Out1 : Out0;
}
301+
211302
bool EraVMOptimizeSelectPreRA::tryFoldSelectZero(MachineBasicBlock &MBB) {
212303
SmallPtrSet<MachineInstr *, 4> ToRemove;
213304
for (auto &MI : MBB) {
@@ -297,58 +388,59 @@ bool EraVMOptimizeSelectPreRA::tryFoldSelectZero(MachineBasicBlock &MBB) {
297388
return !ToRemove.empty();
298389
}
299390

300-
bool EraVMOptimizeSelectPreRA::tryFoldAddToSelect(MachineBasicBlock &MBB) {
391+
bool EraVMOptimizeSelectPreRA::tryFoldToSelect(MachineBasicBlock &MBB) {
301392
SmallVector<MachineInstr *, 16> ToRemove;
302393
SmallPtrSet<MachineInstr *, 16> UsesToUpdate;
303394

304-
// 1. Collect all instructions to be combined.
305395
for (auto &MI : MBB) {
306-
if (!TII->isAdd(MI) || TII->getCCCode(MI) != EraVMCC::COND_NONE ||
396+
if ((!TII->isArithmetic(MI) && !TII->isBitwise(MI)) ||
397+
TII->getCCCode(MI) != EraVMCC::COND_NONE ||
307398
EraVMInstrInfo::isFlagSettingInstruction(MI) ||
308399
!EraVM::hasRROutAddressingMode(MI))
309400
continue;
310401

311-
// It's expected that if there are more uses of add, it's very unlikely that
402+
// If there is more than one output, we have to make sure the
403+
// other output is not used before we can fold this instruction.
404+
Register OutReg = getOutRegToFold(MI);
405+
if (OutReg == EraVM::NoRegister)
406+
continue;
407+
408+
// It's expected that if there are more uses, it's very unlikely that
312409
// all of them are select instruction where folding is feasible.
313-
Register OutAddReg = EraVM::out0Iterator(MI)->getReg();
314-
if (!RegInfo->hasOneNonDBGUser(OutAddReg))
410+
if (!RegInfo->hasOneNonDBGUser(OutReg))
315411
continue;
316412

317-
MachineInstr &UseMI = *RegInfo->use_instr_nodbg_begin(OutAddReg);
413+
MachineInstr &UseMI = *RegInfo->use_instr_nodbg_begin(OutReg);
318414
if (UsesToUpdate.count(&UseMI))
319415
continue;
320416

321-
SmallSet<Register, 2> InAddRegs;
322-
InAddRegs.insert(EraVM::in1Iterator(MI)->getReg());
417+
SmallSet<Register, 2> InRegs;
418+
InRegs.insert(EraVM::in1Iterator(MI)->getReg());
323419
if (EraVM::hasRRInAddressingMode(MI))
324-
InAddRegs.insert(EraVM::in0Iterator(MI)->getReg());
420+
InRegs.insert(EraVM::in0Iterator(MI)->getReg());
325421

326-
// In order to fold add to select, we expect that other input of a select
327-
// instruction is matching with one of the add inputs.
422+
// In order to fold, we expect that other input of a select instruction
423+
// is matching with one of the MI inputs.
328424
if (UseMI.getOpcode() != EraVM::SELrrr ||
329-
(!InAddRegs.count(EraVM::in0Iterator(UseMI)->getReg()) &&
330-
!InAddRegs.count(EraVM::in1Iterator(UseMI)->getReg())))
425+
(!InRegs.count(EraVM::in0Iterator(UseMI)->getReg()) &&
426+
!InRegs.count(EraVM::in1Iterator(UseMI)->getReg())))
331427
continue;
332428

333-
bool OutAddIsIn1Use = EraVM::out0Iterator(MI)->getReg() ==
334-
EraVM::in1Iterator(UseMI)->getReg();
429+
bool OutIsIn1Use = OutReg == EraVM::in1Iterator(UseMI)->getReg();
335430
auto CC = getImmOrCImm(*EraVM::ccIterator(UseMI));
336431

337432
// The COND_OF is overflow LT which has no reversed version, so we don't
338-
// attempt to inverse it.
339-
if (OutAddIsIn1Use && CC == EraVMCC::COND_OF)
433+
// attempt to inverse it.
434+
if (OutIsIn1Use && CC == EraVMCC::COND_OF)
340435
continue;
341436

342-
auto CCNewMI = OutAddIsIn1Use ? InverseCond[CC] : CC;
343-
Register TieReg = OutAddIsIn1Use ? EraVM::in0Iterator(UseMI)->getReg()
344-
: EraVM::in1Iterator(UseMI)->getReg();
437+
auto CCNewMI = OutIsIn1Use ? InverseCond[CC] : CC;
438+
Register TieReg = OutIsIn1Use ? EraVM::in0Iterator(UseMI)->getReg()
439+
: EraVM::in1Iterator(UseMI)->getReg();
345440

346-
[[maybe_unused]] auto *NewMI = getFoldedInst(
347-
UseMI, MI, EraVM::in0Range(MI), EraVM::in1Iterator(MI)->getReg(),
348-
MI.getOpcode(), CCNewMI, TieReg);
349-
LLVM_DEBUG(dbgs() << "== Folding add:"; MI.dump();
350-
dbgs() << " and use:"; UseMI.dump();
351-
dbgs() << " into:"; NewMI->dump(););
441+
[[maybe_unused]] auto *NewMI = foldToSelect(MI, UseMI, CCNewMI, TieReg);
442+
LLVM_DEBUG(dbgs() << "== Folding:"; MI.dump(); dbgs() << " and use:";
443+
UseMI.dump(); dbgs() << " into:"; NewMI->dump(););
352444

353445
UsesToUpdate.insert(&UseMI);
354446
ToRemove.emplace_back(&MI);
@@ -373,7 +465,7 @@ bool EraVMOptimizeSelectPreRA::runOnMachineFunction(MachineFunction &MF) {
373465
bool Changed = false;
374466
for (MachineBasicBlock &MBB : MF) {
375467
Changed |= tryFoldSelectZero(MBB);
376-
Changed |= tryFoldAddToSelect(MBB);
468+
Changed |= tryFoldToSelect(MBB);
377469
}
378470
return Changed;
379471
}

llvm/test/CodeGen/EraVM/combine-flag-setting.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -375,7 +375,7 @@ define i1 @NoCombine(i256 %p1, i1 %sel, i256 %random) nounwind {
375375
%val = load i256, i256* %valptr
376376
%p2 = xor i256 %val, %p1
377377
; We cannot combine xor with icmp because `select` will overwrite flags
378-
; CHECK: xor stack-[1], r{{[0-9]+}}, r{{[0-9]+}}
378+
; CHECK: xor.eq stack-[1], r{{[0-9]+}}, r{{[0-9]+}}
379379
%s = select i1 %sel, i256 %p1, i256 %p2
380380
%cmp = icmp eq i256 %s, 0
381381
ret i1 %cmp

llvm/test/CodeGen/EraVM/fold-and-to-select.ll

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@ target triple = "eravm"
77
define i256 @test_large_imm1(i256 %a) {
88
; CHECK-LABEL: test_large_imm1:
99
; CHECK: ; %bb.0:
10-
; CHECK-NEXT: and @CPI0_0[0], r1, r2
11-
; CHECK-NEXT: sub.s! @CPI0_1[0], r1, r3
12-
; CHECK-NEXT: add.ge r1, r0, r2
13-
; CHECK-NEXT: add r2, r0, r1
10+
; CHECK-NEXT: sub.s! @CPI0_1[0], r1, r2
11+
; CHECK-NEXT: and.lt @CPI0_0[0], r1, r1
1412
; CHECK-NEXT: ret
1513
%and = and i256 26959946660873538059280334323183841250350249843923952699046031785980, %a
1614
%cmp = icmp ult i256 %a, -26959946660873538059280334323183841250350249843923952699046031785985
@@ -21,9 +19,8 @@ define i256 @test_large_imm1(i256 %a) {
2119
define i256 @test_large_imm2(i256 %a) {
2220
; CHECK-LABEL: test_large_imm2:
2321
; CHECK: ; %bb.0:
24-
; CHECK-NEXT: and @CPI1_0[0], r1, r2
25-
; CHECK-NEXT: sub.s! @CPI1_1[0], r1, r3
26-
; CHECK-NEXT: add.ge r2, r0, r1
22+
; CHECK-NEXT: sub.s! @CPI1_1[0], r1, r2
23+
; CHECK-NEXT: and.ge @CPI1_0[0], r1, r1
2724
; CHECK-NEXT: ret
2825
%and = and i256 26959946660873538059280334323183841250350249843923952699046031785980, %a
2926
%cmp = icmp ult i256 %a, -26959946660873538059280334323183841250350249843923952699046031785985
@@ -34,10 +31,8 @@ define i256 @test_large_imm2(i256 %a) {
3431
define i256 @test_small_imm1(i256 %a) {
3532
; CHECK-LABEL: test_small_imm1:
3633
; CHECK: ; %bb.0:
37-
; CHECK-NEXT: and 10, r1, r2
38-
; CHECK-NEXT: sub.s! @CPI2_0[0], r1, r3
39-
; CHECK-NEXT: add.ge r1, r0, r2
40-
; CHECK-NEXT: add r2, r0, r1
34+
; CHECK-NEXT: sub.s! @CPI2_0[0], r1, r2
35+
; CHECK-NEXT: and.lt 10, r1, r1
4136
; CHECK-NEXT: ret
4237
%and = and i256 10, %a
4338
%cmp = icmp ult i256 %a, -5
@@ -48,9 +43,8 @@ define i256 @test_small_imm1(i256 %a) {
4843
define i256 @test_small_imm2(i256 %a) {
4944
; CHECK-LABEL: test_small_imm2:
5045
; CHECK: ; %bb.0:
51-
; CHECK-NEXT: and 10, r1, r2
52-
; CHECK-NEXT: sub.s! @CPI3_0[0], r1, r3
53-
; CHECK-NEXT: add.ge r2, r0, r1
46+
; CHECK-NEXT: sub.s! @CPI3_0[0], r1, r2
47+
; CHECK-NEXT: and.ge 10, r1, r1
5448
; CHECK-NEXT: ret
5549
%and = and i256 10, %a
5650
%cmp = icmp ult i256 %a, -5
@@ -61,10 +55,8 @@ define i256 @test_small_imm2(i256 %a) {
6155
define i256 @test_reg1(i256 %a, i256 %b) {
6256
; CHECK-LABEL: test_reg1:
6357
; CHECK: ; %bb.0:
64-
; CHECK-NEXT: and r1, r2, r3
65-
; CHECK-NEXT: sub! r1, r2, r2
66-
; CHECK-NEXT: add.ge r1, r0, r3
67-
; CHECK-NEXT: add r3, r0, r1
58+
; CHECK-NEXT: sub! r1, r2, r3
59+
; CHECK-NEXT: and.lt r1, r2, r1
6860
; CHECK-NEXT: ret
6961
%and = and i256 %a, %b
7062
%cmp = icmp ult i256 %a, %b
@@ -75,9 +67,8 @@ define i256 @test_reg1(i256 %a, i256 %b) {
7567
define i256 @test_reg2(i256 %a, i256 %b) {
7668
; CHECK-LABEL: test_reg2:
7769
; CHECK: ; %bb.0:
78-
; CHECK-NEXT: and r1, r2, r3
79-
; CHECK-NEXT: sub! r1, r2, r2
80-
; CHECK-NEXT: add.ge r3, r0, r1
70+
; CHECK-NEXT: sub! r1, r2, r3
71+
; CHECK-NEXT: and.ge r1, r2, r1
8172
; CHECK-NEXT: ret
8273
%and = and i256 %a, %b
8374
%cmp = icmp ult i256 %a, %b

llvm/test/CodeGen/EraVM/fold-div-to-select.ll

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@ target triple = "eravm"
77
define i256 @test_large_imm1(i256 %a) {
88
; CHECK-LABEL: test_large_imm1:
99
; CHECK: ; %bb.0:
10-
; CHECK-NEXT: div.s @CPI0_0[0], r1, r2, r3
11-
; CHECK-NEXT: sub.s! @CPI0_1[0], r1, r3
12-
; CHECK-NEXT: add.ge r1, r0, r2
13-
; CHECK-NEXT: add r2, r0, r1
10+
; CHECK-NEXT: sub.s! @CPI0_1[0], r1, r2
11+
; CHECK-NEXT: div.s.lt @CPI0_0[0], r1, r1, r2
1412
; CHECK-NEXT: ret
1513
%div = udiv i256 %a, 26959946660873538059280334323183841250350249843923952699046031785980
1614
%cmp = icmp ult i256 %a, -26959946660873538059280334323183841250350249843923952699046031785985
@@ -21,9 +19,8 @@ define i256 @test_large_imm1(i256 %a) {
2119
define i256 @test_large_imm2(i256 %a) {
2220
; CHECK-LABEL: test_large_imm2:
2321
; CHECK: ; %bb.0:
24-
; CHECK-NEXT: div.s @CPI1_0[0], r1, r2, r3
25-
; CHECK-NEXT: sub.s! @CPI1_1[0], r1, r3
26-
; CHECK-NEXT: add.ge r2, r0, r1
22+
; CHECK-NEXT: sub.s! @CPI1_1[0], r1, r2
23+
; CHECK-NEXT: div.s.ge @CPI1_0[0], r1, r1, r2
2724
; CHECK-NEXT: ret
2825
%div = udiv i256 %a, 26959946660873538059280334323183841250350249843923952699046031785980
2926
%cmp = icmp ult i256 %a, -26959946660873538059280334323183841250350249843923952699046031785985
@@ -34,10 +31,8 @@ define i256 @test_large_imm2(i256 %a) {
3431
define i256 @test_small_imm1(i256 %a) {
3532
; CHECK-LABEL: test_small_imm1:
3633
; CHECK: ; %bb.0:
37-
; CHECK-NEXT: div.s 10, r1, r2, r3
38-
; CHECK-NEXT: sub.s! @CPI2_0[0], r1, r3
39-
; CHECK-NEXT: add.ge r1, r0, r2
40-
; CHECK-NEXT: add r2, r0, r1
34+
; CHECK-NEXT: sub.s! @CPI2_0[0], r1, r2
35+
; CHECK-NEXT: div.s.lt 10, r1, r1, r2
4136
; CHECK-NEXT: ret
4237
%div = udiv i256 %a, 10
4338
%cmp = icmp ult i256 %a, -5
@@ -48,9 +43,8 @@ define i256 @test_small_imm1(i256 %a) {
4843
define i256 @test_small_imm2(i256 %a) {
4944
; CHECK-LABEL: test_small_imm2:
5045
; CHECK: ; %bb.0:
51-
; CHECK-NEXT: div.s 10, r1, r2, r3
52-
; CHECK-NEXT: sub.s! @CPI3_0[0], r1, r3
53-
; CHECK-NEXT: add.ge r2, r0, r1
46+
; CHECK-NEXT: sub.s! @CPI3_0[0], r1, r2
47+
; CHECK-NEXT: div.s.ge 10, r1, r1, r2
5448
; CHECK-NEXT: ret
5549
%div = udiv i256 %a, 10
5650
%cmp = icmp ult i256 %a, -5
@@ -61,10 +55,8 @@ define i256 @test_small_imm2(i256 %a) {
6155
define i256 @test_reg1(i256 %a, i256 %b) {
6256
; CHECK-LABEL: test_reg1:
6357
; CHECK: ; %bb.0:
64-
; CHECK-NEXT: div r1, r2, r3, r4
65-
; CHECK-NEXT: sub! r1, r2, r2
66-
; CHECK-NEXT: add.ge r1, r0, r3
67-
; CHECK-NEXT: add r3, r0, r1
58+
; CHECK-NEXT: sub! r1, r2, r3
59+
; CHECK-NEXT: div.lt r1, r2, r1, r2
6860
; CHECK-NEXT: ret
6961
%div = udiv i256 %a, %b
7062
%cmp = icmp ult i256 %a, %b
@@ -75,9 +67,8 @@ define i256 @test_reg1(i256 %a, i256 %b) {
7567
define i256 @test_reg2(i256 %a, i256 %b) {
7668
; CHECK-LABEL: test_reg2:
7769
; CHECK: ; %bb.0:
78-
; CHECK-NEXT: div r1, r2, r3, r4
79-
; CHECK-NEXT: sub! r1, r2, r2
80-
; CHECK-NEXT: add.ge r3, r0, r1
70+
; CHECK-NEXT: sub! r1, r2, r3
71+
; CHECK-NEXT: div.ge r1, r2, r1, r2
8172
; CHECK-NEXT: ret
8273
%div = udiv i256 %a, %b
8374
%cmp = icmp ult i256 %a, %b

0 commit comments

Comments
 (0)