Skip to content

Commit a80ebd0

Browse files
committed
[AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization
Frame-base materialization may insert vector instructions before EXEC is initialised. Fix this by moving lowering of llvm.amdgcn.init.exec later in the backend. Also remove the SI_INIT_EXEC_LO pseudo as it is not necessary.

Reviewed By: ruiling

Differential Revision: https://reviews.llvm.org/D94645
1 parent afd483e commit a80ebd0

File tree

5 files changed

+207
-90
lines changed

5 files changed

+207
-90
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[],
182182
// Set EXEC according to a thread count packed in an SGPR input:
183183
// thread_count = (input >> bitoffset) & 0x7f;
184184
// This is always moved to the beginning of the basic block.
185+
// Note: only inreg arguments to the parent function are valid as
186+
// inputs to this intrinsic, computed values cannot be used.
185187
def int_amdgcn_init_exec_from_input : Intrinsic<[],
186188
[llvm_i32_ty, // 32-bit SGPR input
187189
llvm_i32_ty], // bit offset of the thread count

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
40214021
MI.eraseFromParent();
40224022
return BB;
40234023
}
4024-
case AMDGPU::SI_INIT_EXEC:
4025-
// This should be before all vector instructions.
4026-
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
4027-
AMDGPU::EXEC)
4028-
.addImm(MI.getOperand(0).getImm());
4029-
MI.eraseFromParent();
4030-
return BB;
4031-
4032-
case AMDGPU::SI_INIT_EXEC_LO:
4033-
// This should be before all vector instructions.
4034-
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
4035-
AMDGPU::EXEC_LO)
4036-
.addImm(MI.getOperand(0).getImm());
4037-
MI.eraseFromParent();
4038-
return BB;
4039-
4040-
case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
4041-
// Extract the thread count from an SGPR input and set EXEC accordingly.
4042-
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
4043-
//
4044-
// S_BFE_U32 count, input, {shift, 7}
4045-
// S_BFM_B64 exec, count, 0
4046-
// S_CMP_EQ_U32 count, 64
4047-
// S_CMOV_B64 exec, -1
4048-
MachineInstr *FirstMI = &*BB->begin();
4049-
MachineRegisterInfo &MRI = MF->getRegInfo();
4050-
Register InputReg = MI.getOperand(0).getReg();
4051-
Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4052-
bool Found = false;
4053-
4054-
// Move the COPY of the input reg to the beginning, so that we can use it.
4055-
for (auto I = BB->begin(); I != &MI; I++) {
4056-
if (I->getOpcode() != TargetOpcode::COPY ||
4057-
I->getOperand(0).getReg() != InputReg)
4058-
continue;
4059-
4060-
if (I == FirstMI) {
4061-
FirstMI = &*++BB->begin();
4062-
} else {
4063-
I->removeFromParent();
4064-
BB->insert(FirstMI, &*I);
4065-
}
4066-
Found = true;
4067-
break;
4068-
}
4069-
assert(Found);
4070-
(void)Found;
4071-
4072-
// This should be before all vector instructions.
4073-
unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
4074-
bool isWave32 = getSubtarget()->isWave32();
4075-
unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4076-
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
4077-
.addReg(InputReg)
4078-
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
4079-
BuildMI(*BB, FirstMI, DebugLoc(),
4080-
TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
4081-
Exec)
4082-
.addReg(CountReg)
4083-
.addImm(0);
4084-
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
4085-
.addReg(CountReg, RegState::Kill)
4086-
.addImm(getSubtarget()->getWavefrontSize());
4087-
BuildMI(*BB, FirstMI, DebugLoc(),
4088-
TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
4089-
Exec)
4090-
.addImm(-1);
4091-
MI.eraseFromParent();
4092-
return BB;
4093-
}
4094-
40954024
case AMDGPU::GET_GROUPSTATICSIZE: {
40964025
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
40974026
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -399,32 +399,13 @@ def SI_INIT_EXEC : SPseudoInstSI <
399399
(outs), (ins i64imm:$src),
400400
[(int_amdgcn_init_exec (i64 timm:$src))]> {
401401
let Defs = [EXEC];
402-
let usesCustomInserter = 1;
403-
let isAsCheapAsAMove = 1;
404-
let WaveSizePredicate = isWave64;
405-
}
406-
407-
// FIXME: Intrinsic should be mangled for wave size.
408-
def SI_INIT_EXEC_LO : SPseudoInstSI <
409-
(outs), (ins i32imm:$src), []> {
410-
let Defs = [EXEC_LO];
411-
let usesCustomInserter = 1;
412402
let isAsCheapAsAMove = 1;
413-
let WaveSizePredicate = isWave32;
414403
}
415404

416-
// FIXME: Wave32 version
417405
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
418406
(outs), (ins SSrc_b32:$input, i32imm:$shift),
419407
[(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
420408
let Defs = [EXEC];
421-
let usesCustomInserter = 1;
422-
}
423-
424-
def : GCNPat <
425-
(int_amdgcn_init_exec timm:$src),
426-
(SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
427-
let WaveSizePredicate = isWave32;
428409
}
429410

430411
// Return for returning shaders to a shader variant epilog.

llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ class SILowerControlFlow : public MachineFunctionPass {
9393

9494
MachineBasicBlock *emitEndCf(MachineInstr &MI);
9595

96+
void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);
97+
9698
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
9799
SmallVectorImpl<MachineOperand> &Src) const;
98100

@@ -661,6 +663,90 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
661663
return SplitBB;
662664
}
663665

666+
void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
667+
MachineInstr &MI) {
668+
MachineFunction &MF = *MBB->getParent();
669+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
670+
bool IsWave32 = ST.isWave32();
671+
672+
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
673+
// This should be before all vector instructions.
674+
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
675+
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
676+
.addImm(MI.getOperand(0).getImm());
677+
if (LIS)
678+
LIS->RemoveMachineInstrFromMaps(MI);
679+
MI.eraseFromParent();
680+
return;
681+
}
682+
683+
// Extract the thread count from an SGPR input and set EXEC accordingly.
684+
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
685+
//
686+
// S_BFE_U32 count, input, {shift, 7}
687+
// S_BFM_B64 exec, count, 0
688+
// S_CMP_EQ_U32 count, 64
689+
// S_CMOV_B64 exec, -1
690+
Register InputReg = MI.getOperand(0).getReg();
691+
MachineInstr *FirstMI = &*MBB->begin();
692+
if (InputReg.isVirtual()) {
693+
MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
694+
assert(DefInstr && DefInstr->isCopy());
695+
if (DefInstr->getParent() == MBB) {
696+
if (DefInstr != FirstMI) {
697+
// If the `InputReg` is defined in current block, we also need to
698+
// move that instruction to the beginning of the block.
699+
DefInstr->removeFromParent();
700+
MBB->insert(FirstMI, DefInstr);
701+
if (LIS)
702+
LIS->handleMove(*DefInstr);
703+
} else {
704+
// If first instruction is definition then move pointer after it.
705+
FirstMI = &*std::next(FirstMI->getIterator());
706+
}
707+
}
708+
}
709+
710+
// Insert instruction sequence at block beginning (before vector operations).
711+
const DebugLoc DL = MI.getDebugLoc();
712+
const unsigned WavefrontSize = ST.getWavefrontSize();
713+
const unsigned Mask = (WavefrontSize << 1) - 1;
714+
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
715+
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
716+
.addReg(InputReg)
717+
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
718+
auto BfmMI =
719+
BuildMI(*MBB, FirstMI, DL,
720+
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
721+
.addReg(CountReg)
722+
.addImm(0);
723+
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
724+
.addReg(CountReg, RegState::Kill)
725+
.addImm(WavefrontSize);
726+
auto CmovMI =
727+
BuildMI(*MBB, FirstMI, DL,
728+
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
729+
Exec)
730+
.addImm(-1);
731+
732+
if (!LIS) {
733+
MI.eraseFromParent();
734+
return;
735+
}
736+
737+
LIS->RemoveMachineInstrFromMaps(MI);
738+
MI.eraseFromParent();
739+
740+
LIS->InsertMachineInstrInMaps(*BfeMI);
741+
LIS->InsertMachineInstrInMaps(*BfmMI);
742+
LIS->InsertMachineInstrInMaps(*CmpMI);
743+
LIS->InsertMachineInstrInMaps(*CmovMI);
744+
745+
LIS->removeInterval(InputReg);
746+
LIS->createAndComputeVirtRegInterval(InputReg);
747+
LIS->createAndComputeVirtRegInterval(CountReg);
748+
}
749+
664750
bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
665751
auto GetFallThroughSucc = [=](MachineBasicBlock *B) -> MachineBasicBlock * {
666752
auto *S = B->getNextNode();
@@ -781,6 +867,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
781867
SplitMBB = process(MI);
782868
break;
783869

870+
// FIXME: find a better place for this
871+
case AMDGPU::SI_INIT_EXEC:
872+
case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
873+
lowerInitExec(MBB, MI);
874+
if (LIS)
875+
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
876+
break;
877+
784878
default:
785879
break;
786880
}

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,117 @@ main_body:
8484
unreachable
8585
}
8686

87+
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
88+
; GCN-NOT: {{^}}v_
89+
; GCN: s_mov_b64 exec, -1
90+
; GCN: v_mov
91+
; GCN: v_add
92+
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
93+
main_body:
94+
%array0 = alloca [1024 x i32], align 16, addrspace(5)
95+
%array1 = alloca [20 x i32], align 16, addrspace(5)
96+
call void @llvm.amdgcn.init.exec(i64 -1)
97+
98+
%ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
99+
store i32 %a, i32 addrspace(5)* %ptr0, align 4
100+
101+
%ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
102+
store i32 %a, i32 addrspace(5)* %ptr1, align 4
103+
104+
%ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
105+
store i32 %b, i32 addrspace(5)* %ptr2, align 4
106+
107+
%ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
108+
%v3 = load i32, i32 addrspace(5)* %ptr3, align 4
109+
110+
%ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
111+
%v4 = load i32, i32 addrspace(5)* %ptr4, align 4
112+
113+
%v5 = add i32 %v3, %v4
114+
%v = bitcast i32 %v5 to float
115+
ret float %v
116+
}
117+
118+
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
119+
; GCN-NOT: {{^}}v_
120+
; GCN: s_bfe_u32 s2, s2, 0x70008
121+
; GCN-NEXT: s_bfm_b64 exec, s2, 0
122+
; GCN-NEXT: s_cmp_eq_u32 s2, 64
123+
; GCN-NEXT: s_cmov_b64 exec, -1
124+
; GCN: v_mov
125+
; GCN: v_add
126+
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
127+
main_body:
128+
%array0 = alloca [1024 x i32], align 16, addrspace(5)
129+
%array1 = alloca [20 x i32], align 16, addrspace(5)
130+
call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
131+
132+
%ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
133+
store i32 %a, i32 addrspace(5)* %ptr0, align 4
134+
135+
%ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
136+
store i32 %a, i32 addrspace(5)* %ptr1, align 4
137+
138+
%ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
139+
store i32 %b, i32 addrspace(5)* %ptr2, align 4
140+
141+
%ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
142+
%v3 = load i32, i32 addrspace(5)* %ptr3, align 4
143+
144+
%ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
145+
%v4 = load i32, i32 addrspace(5)* %ptr4, align 4
146+
147+
%v5 = add i32 %v3, %v4
148+
%v = bitcast i32 %v5 to float
149+
ret float %v
150+
}
151+
152+
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
153+
; GCN-NOT: {{^}}v_
154+
; GCN: %endif
155+
; GCN: s_bfe_u32 s3, s2, 0x70008
156+
; GCN-NEXT: s_bfm_b64 exec, s3, 0
157+
; GCN-NEXT: s_cmp_eq_u32 s3, 64
158+
; GCN-NEXT: s_cmov_b64 exec, -1
159+
; GCN: v_mov
160+
; GCN: v_add
161+
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
162+
main_body:
163+
; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
164+
%array0 = alloca [1024 x i32], align 16, addrspace(5)
165+
%array1 = alloca [20 x i32], align 16, addrspace(5)
166+
167+
%cc = icmp uge i32 %count, 32
168+
br i1 %cc, label %endif, label %if
169+
170+
if:
171+
call void asm sideeffect "", ""()
172+
br label %endif
173+
174+
endif:
175+
call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
176+
177+
%ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
178+
store i32 %a, i32 addrspace(5)* %ptr0, align 4
179+
180+
%ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
181+
store i32 %a, i32 addrspace(5)* %ptr1, align 4
182+
183+
%ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
184+
store i32 %b, i32 addrspace(5)* %ptr2, align 4
185+
186+
%ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
187+
%v3 = load i32, i32 addrspace(5)* %ptr3, align 4
188+
189+
%ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
190+
%v4 = load i32, i32 addrspace(5)* %ptr4, align 4
191+
192+
%v5 = add i32 %v3, %v4
193+
%v6 = add i32 %v5, %count
194+
%v = bitcast i32 %v6 to float
195+
ret float %v
196+
}
197+
87198
declare void @llvm.amdgcn.init.exec(i64) #1
88199
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
89200

0 commit comments

Comments (0)