Skip to content

Commit 78e9b5d

Browse files
committed
[AMDGPU] Change control flow intrinsic lowering so that the wave reconverges at the end of the predecessor block. Tests updated. Else exit mask fixed. AndTerm changed to a regular And in emitLoop.
1 parent 1974fac commit 78e9b5d

File tree

53 files changed

+1349
-1262
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+1349
-1262
lines changed

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 92 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ class SILowerControlFlow : public MachineFunctionPass {
8686
unsigned Select;
8787
unsigned CmovOpc;
8888
unsigned AndOpc;
89+
unsigned Andn2Opc;
8990
unsigned OrOpc;
9091
unsigned XorOpc;
9192
unsigned MovTermOpc;
@@ -102,6 +103,9 @@ class SILowerControlFlow : public MachineFunctionPass {
102103
void emitWaveDiverge(MachineInstr &MI, Register EnabledLanesMask,
103104
Register DisableLanesMask);
104105

106+
void emitWaveInvert(MachineInstr &MI, Register EnabledLanesMask,
107+
Register DisableLanesMask);
108+
105109
void emitEndCf(MachineInstr &MI);
106110

107111
void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
@@ -194,7 +198,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
194198
// Lowers the instruction MI handed to emitElse.
//
// Operand 0 of MI is read as the inverted-condition register and operand 1
// as the condition register. Both are forwarded, together with MI, to the
// wave-mask helper: the condition register as the enabled-lanes mask and
// operand 0 as the disabled-lanes mask.
//
// In this diff hunk the line marked '-' (call to emitWaveDiverge) is the
// removed call and the line marked '+' (call to emitWaveInvert) replaces it.
void SILowerControlFlow::emitElse(MachineInstr &MI) {
195199
Register InvCondReg = MI.getOperand(0).getReg();
196200
Register CondReg = MI.getOperand(1).getReg();
197-
emitWaveDiverge(MI, CondReg, InvCondReg);
201+
emitWaveInvert(MI, CondReg, InvCondReg);
198202
}
199203

200204
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
@@ -260,10 +264,9 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
260264
Register MaskExit = MRI->createVirtualRegister(BoolRC);
261265
Register AndZero = MRI->createVirtualRegister(BoolRC);
262266

263-
MachineInstr *CondLoop =
264-
BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), MaskLoop)
265-
.addReg(Exec)
266-
.addReg(Cond);
267+
MachineInstr *CondLoop = BuildMI(MBB, &MI, DL, TII->get(Andn2Opc), MaskLoop)
268+
.addReg(Exec)
269+
.addReg(Cond);
267270

268271
MachineInstr *ExitExec = BuildMI(MBB, &MI, DL, TII->get(OrOpc), MaskExit)
269272
.addReg(Cond)
@@ -372,6 +375,88 @@ void SILowerControlFlow::emitWaveDiverge(MachineInstr &MI,
372375
LIS->removeAllRegUnitsForPhysReg(Exec);
373376
}
374377

378+
// Replaces MI with a sequence that inverts the active-lane mask and branches.
//
// Emitted before MI, in this order:
//   DisableLanesMask = EnabledLanesMask XOR Exec
//   ExitMask         = Exec OR DisableLanesMask
//   TestResultReg    = EnabledLanesMask AND TestMask
//   Exec             = Select(EnabledLanesMask, ExitMask)
// followed by an S_CBRANCH_SCC1 to the non-flow target (when one is found)
// and an unconditional branch to the flow block taken from MI operand 2.
//
// \param MI               instruction being lowered; operands 0 and 1 are
//                         read as registers, operand 2 as a basic block.
//                         MI is erased before returning.
// \param EnabledLanesMask register used as the source mask.
// \param DisableLanesMask register defined by the XOR above.
//
// When LIS is available, the new instructions are registered in its maps,
// intervals are created for the two new virtual registers, and the registers
// in MI operands 0 and 1 are added to RecomputeRegs.
void SILowerControlFlow::emitWaveInvert(MachineInstr &MI,
379+
Register EnabledLanesMask,
380+
Register DisableLanesMask) {
381+
MachineBasicBlock &MBB = *MI.getParent();
382+
const DebugLoc &DL = MI.getDebugLoc();
383+
// All new instructions are inserted in front of MI.
MachineBasicBlock::iterator I(MI);
384+
385+
// DisableLanesMask = EnabledLanesMask ^ Exec.
MachineInstr *CondInverted =
386+
BuildMI(MBB, I, DL, TII->get(XorOpc), DisableLanesMask)
387+
.addReg(EnabledLanesMask)
388+
.addReg(Exec);
389+
390+
// Move the recorded kill of DisableLanesMask from MI to the new XOR.
// NOTE(review): DisableLanesMask is the register the XOR defines, while
// EnabledLanesMask is the one it reads - confirm this is the intended
// register for the kill transfer.
if (LV) {
391+
LV->replaceKillInstruction(DisableLanesMask, MI, *CondInverted);
392+
}
393+
394+
Register TestResultReg = MRI->createVirtualRegister(BoolRC);
395+
// If EnabledLanesMask is zero, the masked bits have to be restored on the
396+
// skip path; ExitMask (Exec | DisableLanesMask) is the value used for that.
397+
Register ExitMask = MRI->createVirtualRegister(BoolRC);
398+
MachineInstr *ExitMaskSet = BuildMI(MBB, I, DL, TII->get(OrOpc), ExitMask)
399+
.addReg(Exec)
400+
.addReg(DisableLanesMask);
401+
402+
// TestResultReg = EnabledLanesMask & TestMask. It is emitted directly before
// the select; presumably its condition-code result is what the select and
// the S_CBRANCH_SCC1 below consume - TODO confirm.
MachineInstr *IfZeroMask =
403+
BuildMI(MBB, I, DL, TII->get(AndOpc), TestResultReg)
404+
.addReg(EnabledLanesMask)
405+
.addImm(TestMask);
406+
407+
// Write Exec from either EnabledLanesMask or ExitMask.
MachineInstr *SetExecForSucc = BuildMI(MBB, I, DL, TII->get(Select), Exec)
408+
.addReg(EnabledLanesMask)
409+
.addReg(ExitMask);
410+
411+
MachineBasicBlock *FlowBB = MI.getOperand(2).getMBB();
412+
MachineBasicBlock *TargetBB = nullptr;
413+
// determine target BBs
414+
I = skipToUncondBrOrEnd(MBB, I);
415+
if (I != MBB.end()) {
416+
// skipToUncondBrOrEnd returns either unconditional branch or end()
// An unconditional branch exists: remember its destination as the
// conditional target and redirect the branch itself to FlowBB.
417+
TargetBB = I->getOperand(0).getMBB();
418+
I->getOperand(0).setMBB(FlowBB);
419+
} else {
420+
// assert(MBB.succ_size() == 2);
421+
// No unconditional branch: use the first successor that is not FlowBB as
// the conditional target (TargetBB stays null if there is none).
for (auto Succ : successors(&MBB)) {
422+
if (Succ != FlowBB) {
423+
TargetBB = Succ;
424+
break;
425+
}
426+
}
427+
// Append an explicit unconditional branch to FlowBB at the end of MBB.
I = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(FlowBB);
428+
if (LIS)
429+
LIS->InsertMachineInstrInMaps(*I);
430+
}
431+
432+
// Place the conditional branch to TargetBB in front of the unconditional
// branch that I now refers to.
if (TargetBB) {
433+
MachineInstr *NewBr =
434+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)).addMBB(TargetBB);
435+
if (LIS)
436+
LIS->InsertMachineInstrInMaps(*NewBr);
437+
}
438+
439+
// Without LiveIntervals there is nothing more to update.
if (!LIS) {
440+
MI.eraseFromParent();
441+
return;
442+
}
443+
444+
// Register the new instructions; the select takes over MI's entry in the
// LIS maps.
LIS->InsertMachineInstrInMaps(*CondInverted);
445+
LIS->InsertMachineInstrInMaps(*ExitMaskSet);
446+
LIS->InsertMachineInstrInMaps(*IfZeroMask);
447+
LIS->ReplaceMachineInstrInMaps(MI, *SetExecForSucc);
448+
449+
// MI's operands must be read before MI is erased below.
RecomputeRegs.insert(MI.getOperand(0).getReg());
450+
RecomputeRegs.insert(MI.getOperand(1).getReg());
451+
452+
MI.eraseFromParent();
453+
454+
// Create intervals for the two virtual registers introduced above.
LIS->createAndComputeVirtRegInterval(TestResultReg);
455+
LIS->createAndComputeVirtRegInterval(ExitMask);
456+
457+
LIS->removeAllRegUnitsForPhysReg(Exec);
458+
}
459+
375460
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
376461

377462
MachineBasicBlock &BB = *MI.getParent();
@@ -610,6 +695,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
610695
Select = AMDGPU::S_CSELECT_B32;
611696
CmovOpc = AMDGPU::S_CMOV_B32;
612697
AndOpc = AMDGPU::S_AND_B32;
698+
Andn2Opc = AMDGPU::S_ANDN2_B32;
613699
OrOpc = AMDGPU::S_OR_B32;
614700
XorOpc = AMDGPU::S_XOR_B32;
615701
MovTermOpc = AMDGPU::S_MOV_B32_term;
@@ -623,6 +709,7 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
623709
Select = AMDGPU::S_CSELECT_B64;
624710
CmovOpc = AMDGPU::S_CMOV_B64;
625711
AndOpc = AMDGPU::S_AND_B64;
712+
Andn2Opc = AMDGPU::S_ANDN2_B64;
626713
OrOpc = AMDGPU::S_OR_B64;
627714
XorOpc = AMDGPU::S_XOR_B64;
628715
MovTermOpc = AMDGPU::S_MOV_B64_term;

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
117117
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
118118
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
119119
; GFX10-NEXT: s_or_b32 s6, s6, s4
120-
; GFX10-NEXT: s_xor_b32 s4, s5, exec_lo
120+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
121121
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
122122
; GFX10-NEXT: s_and_b32 s8, s4, -1
123123
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
@@ -165,7 +165,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
165165
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
166166
; GFX10-NEXT: s_and_b32 s7, exec_lo, s4
167167
; GFX10-NEXT: s_or_b32 s6, s6, s7
168-
; GFX10-NEXT: s_xor_b32 s7, s5, exec_lo
168+
; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s5
169169
; GFX10-NEXT: s_or_b32 s8, s5, exec_lo
170170
; GFX10-NEXT: s_and_b32 s9, s7, -1
171171
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s8

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
3333
; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
3434
; GFX10-NEXT: s_or_b32 s7, s8, s7
3535
; GFX10-NEXT: s_or_b32 s5, s5, s6
36-
; GFX10-NEXT: s_xor_b32 s8, s4, exec_lo
36+
; GFX10-NEXT: s_andn2_b32 s8, exec_lo, s4
3737
; GFX10-NEXT: s_mov_b32 s6, s7
3838
; GFX10-NEXT: s_or_b32 s7, s4, exec_lo
3939
; GFX10-NEXT: s_and_b32 s9, s8, -1
@@ -156,7 +156,7 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
156156
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
157157
; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
158158
; GFX10-NEXT: s_or_b32 s6, s6, s7
159-
; GFX10-NEXT: s_xor_b32 s7, s4, exec_lo
159+
; GFX10-NEXT: s_andn2_b32 s7, exec_lo, s4
160160
; GFX10-NEXT: s_or_b32 s8, s4, exec_lo
161161
; GFX10-NEXT: s_and_b32 s9, s7, -1
162162
; GFX10-NEXT: s_cselect_b32 exec_lo, s7, s8
@@ -220,7 +220,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
220220
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
221221
; GFX10-NEXT: s_and_b32 s9, exec_lo, s9
222222
; GFX10-NEXT: s_or_b32 s6, s6, s9
223-
; GFX10-NEXT: s_xor_b32 s9, s5, exec_lo
223+
; GFX10-NEXT: s_andn2_b32 s9, exec_lo, s5
224224
; GFX10-NEXT: s_or_b32 s10, s5, exec_lo
225225
; GFX10-NEXT: s_and_b32 s11, s9, -1
226226
; GFX10-NEXT: s_cselect_b32 exec_lo, s9, s10
@@ -325,7 +325,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
325325
; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
326326
; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
327327
; GFX10-NEXT: s_or_b32 s6, s4, s6
328-
; GFX10-NEXT: s_xor_b32 s4, s5, exec_lo
328+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
329329
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
330330
; GFX10-NEXT: s_and_b32 s8, s4, -1
331331
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
@@ -450,7 +450,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
450450
; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
451451
; GFX10-NEXT: s_or_b32 s3, s3, s4
452452
; GFX10-NEXT: s_or_b32 s1, s1, s4
453-
; GFX10-NEXT: s_xor_b32 s4, s0, exec_lo
453+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0
454454
; GFX10-NEXT: s_or_b32 s5, s0, exec_lo
455455
; GFX10-NEXT: s_and_b32 s6, s4, -1
456456
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s5
@@ -528,7 +528,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
528528
; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
529529
; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
530530
; GFX10-NEXT: s_or_b32 s1, s1, s4
531-
; GFX10-NEXT: s_xor_b32 s4, s0, exec_lo
531+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0
532532
; GFX10-NEXT: s_or_b32 s5, s0, exec_lo
533533
; GFX10-NEXT: s_and_b32 s6, s4, -1
534534
; GFX10-NEXT: s_waitcnt_depctr 0xffe3

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
123123
; GFX10-NEXT: ; in Loop: Header=BB2_2 Depth=1
124124
; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
125125
; GFX10-NEXT: s_or_b32 s0, s2, s0
126-
; GFX10-NEXT: s_xor_b32 s2, s0, exec_lo
126+
; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0
127127
; GFX10-NEXT: s_or_b32 s3, s0, exec_lo
128128
; GFX10-NEXT: s_and_b32 s4, s2, -1
129129
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -206,7 +206,7 @@ define amdgpu_cs void @loop_with_2breaks(ptr addrspace(1) %x, ptr addrspace(1) %
206206
; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
207207
; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
208208
; GFX10-NEXT: s_or_b32 s0, s2, s0
209-
; GFX10-NEXT: s_xor_b32 s2, s0, exec_lo
209+
; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0
210210
; GFX10-NEXT: s_or_b32 s3, s0, exec_lo
211211
; GFX10-NEXT: s_and_b32 s4, s2, -1
212212
; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s3
@@ -313,7 +313,7 @@ define amdgpu_cs void @loop_with_3breaks(ptr addrspace(1) %x, ptr addrspace(1) %
313313
; GFX10-NEXT: ; in Loop: Header=BB4_4 Depth=1
314314
; GFX10-NEXT: s_and_b32 s2, exec_lo, s1
315315
; GFX10-NEXT: s_or_b32 s0, s2, s0
316-
; GFX10-NEXT: s_xor_b32 s2, s0, exec_lo
316+
; GFX10-NEXT: s_andn2_b32 s2, exec_lo, s0
317317
; GFX10-NEXT: s_or_b32 s3, s0, exec_lo
318318
; GFX10-NEXT: s_and_b32 s4, s2, -1
319319
; GFX10-NEXT: s_cselect_b32 exec_lo, s2, s3
@@ -435,7 +435,7 @@ define amdgpu_cs void @loop_with_div_break_with_body(ptr addrspace(1) %x, ptr ad
435435
; GFX10-NEXT: s_andn2_b32 s1, s1, exec_lo
436436
; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
437437
; GFX10-NEXT: s_or_b32 s1, s1, s4
438-
; GFX10-NEXT: s_xor_b32 s4, s0, exec_lo
438+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s0
439439
; GFX10-NEXT: s_or_b32 s5, s0, exec_lo
440440
; GFX10-NEXT: s_and_b32 s6, s4, -1
441441
; GFX10-NEXT: s_waitcnt_depctr 0xffe3

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
2121
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
2222
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
2323
; GFX10-NEXT: s_or_b32 s6, s6, s4
24-
; GFX10-NEXT: s_xor_b32 s4, s5, exec_lo
24+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
2525
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
2626
; GFX10-NEXT: s_and_b32 s8, s4, -1
2727
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
@@ -69,7 +69,7 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
6969
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
7070
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
7171
; GFX10-NEXT: s_or_b32 s6, s6, s4
72-
; GFX10-NEXT: s_xor_b32 s4, s5, exec_lo
72+
; GFX10-NEXT: s_andn2_b32 s4, exec_lo, s5
7373
; GFX10-NEXT: s_or_b32 s7, s5, exec_lo
7474
; GFX10-NEXT: s_and_b32 s8, s4, -1
7575
; GFX10-NEXT: s_cselect_b32 exec_lo, s4, s7
@@ -133,7 +133,7 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, i32 %x.size, ptr ad
133133
; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo
134134
; GFX10-NEXT: s_and_b32 s5, exec_lo, s5
135135
; GFX10-NEXT: s_or_b32 s0, s0, s5
136-
; GFX10-NEXT: s_xor_b32 s5, s4, exec_lo
136+
; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4
137137
; GFX10-NEXT: s_or_b32 s6, s4, exec_lo
138138
; GFX10-NEXT: s_and_b32 s7, s5, -1
139139
; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s6

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ define void @temporal_divergent_i32(float %val, ptr %addr) {
1414
; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3
1515
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
1616
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
17-
; GFX10-NEXT: s_xor_b32 s5, s4, exec_lo
17+
; GFX10-NEXT: s_andn2_b32 s5, exec_lo, s4
1818
; GFX10-NEXT: s_or_b32 s6, s4, exec_lo
1919
; GFX10-NEXT: s_and_b32 s7, s5, -1
2020
; GFX10-NEXT: s_cselect_b32 exec_lo, s5, s6

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
224224
; CHECK-NEXT: ; in Loop: Header=BB5_3 Depth=1
225225
; CHECK-NEXT: s_and_b64 s[4:5], exec, s[2:3]
226226
; CHECK-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
227-
; CHECK-NEXT: s_xor_b64 s[4:5], s[0:1], exec
227+
; CHECK-NEXT: s_andn2_b64 s[4:5], exec, s[0:1]
228228
; CHECK-NEXT: s_or_b64 s[6:7], s[0:1], exec
229229
; CHECK-NEXT: s_and_b64 s[8:9], s[4:5], -1
230230
; CHECK-NEXT: s_cselect_b64 exec, s[4:5], s[6:7]

0 commit comments

Comments
 (0)