
Commit b51d4b7

[AMDGPU] Add llvm.amdgcn.dead intrinsic
Shaders that use the llvm.amdgcn.init.whole.wave intrinsic need to explicitly preserve the inactive lanes of VGPRs of interest by adding them as dummy arguments. The code usually looks something like this:

```
define amdgpu_cs_chain void @f(active vgpr args..., i32 %inactive.vgpr1, ..., i32 %inactive.vgprN) {
entry:
  %c = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %c, label %shader, label %tail

shader:
  [...]

tail:
  %inactive.vgpr.arg1 = phi i32 [ %inactive.vgpr1, %entry ], [ poison, %shader ]
  [...]
  ; %inactive.vgpr* then get passed into a llvm.amdgcn.cs.chain call
}
```

Unfortunately, this kind of phi node will get optimized away and the backend won't be able to figure out that it's ok to use the active lanes of `%inactive.vgpr*` inside `shader`.

This patch fixes the issue by introducing a llvm.amdgcn.dead intrinsic, whose result can be used inside the PHI instead of the poison. This will be selected to an IMPLICIT_DEF, which the backend can work with.

At the moment, the llvm.amdgcn.dead intrinsic works only on i32 values. Support for other types can be added later if needed.
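With the intrinsic, the phi takes the intrinsic's result as the incoming value from %shader instead of poison. A minimal sketch of the resulting pattern (value names are illustrative; the complete version is in the updated llvm.amdgcn.init.whole.wave-w32.ll test below):

```
define amdgpu_cs_chain void @f(active vgpr args..., i32 %inactive.vgpr1) {
entry:
  ; Produces an i32 with no meaningful value; it is selected to an IMPLICIT_DEF,
  ; so the backend knows the corresponding register is free to reuse in %shader.
  %dead = call i32 @llvm.amdgcn.dead()
  %c = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %c, label %shader, label %tail

shader:
  [...]

tail:
  %inactive.vgpr.arg1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead, %shader ]
  [...]
}
```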
1 parent: 9a44d96

5 files changed: 46 additions, 21 deletions


llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 0 deletions

```
@@ -3450,4 +3450,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
   [llvm_anyptr_ty], [llvm_anyptr_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Make it clear to the backend that this value is really dead. For instance,
+/// when used as an input to a phi node, it will make it possible for the
+/// backend to allocate the dead lanes for operations within the corresponding
+/// incoming block.
+def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
+  [IntrNoMem, IntrWillReturn, IntrNoCallback]>;
 }
```
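For reference, a minimal sketch of how the intrinsic is used from LLVM IR (the wrapping function is illustrative, not part of the patch): it takes no operands and returns an i32 whose value is intentionally meaningless; the attributes above roughly correspond to memory(none), willreturn and nocallback on the auto-generated declaration.

```
declare i32 @llvm.amdgcn.dead()

define i32 @dead_example() {
  ; The returned value carries no information; the backend selects this call
  ; to an IMPLICIT_DEF, marking the register's lanes as dead.
  %d = call i32 @llvm.amdgcn.dead()
  ret i32 %d
}
```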

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 6 additions & 0 deletions

```
@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_permlane16_swap:
   case Intrinsic::amdgcn_permlane32_swap:
     return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_dead: {
+    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+    I.removeOperand(1); // drop intrinsic ID
+    return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+                                        AMDGPU::VGPR_32RegClass, *MRI);
+  }
   default:
     return selectImpl(I, *CoverageInfo);
   }
```

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions

```
@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_set_inactive_chain_arg:
     case Intrinsic::amdgcn_permlane64:
     case Intrinsic::amdgcn_ds_bpermute_fi_b32:
+    case Intrinsic::amdgcn_dead:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_cvt_pkrtz:
       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
```

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 6 additions & 0 deletions

```
@@ -4268,3 +4268,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
   let hasSideEffects = 1;
   let SubtargetPredicate = isGFX10Plus;
 }
+
+// FIXME: Would be nice if we could set the register class for the destination
+// register too.
+def IMP_DEF_FROM_INTRINSIC: Pat<
+  (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;
+
```

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll

Lines changed: 26 additions & 21 deletions

```
@@ -1118,8 +1118,8 @@ tail:
 ; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
 ; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
 ; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
-; it clear to the backend that it's safe to allocate v9 inside shader.
-; FIXME: Using poison is not clear enough.
+; it clear to the backend that it's safe to allocate v9's active lanes inside
+; shader. This is achieved by using the llvm.amdgcn.dead intrinsic.
 define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
 ; GISEL12-LABEL: with_inactive_vgprs:
 ; GISEL12: ; %bb.0: ; %entry
@@ -1136,14 +1136,15 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
 ; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
 ; GISEL12-NEXT: s_cbranch_execz .LBB6_2
 ; GISEL12-NEXT: ; %bb.1: ; %shader
-; GISEL12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; GISEL12-NEXT: flat_load_b32 v12, v[10:11]
+; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
 ; GISEL12-NEXT: ;;#ASMSTART
 ; GISEL12-NEXT: ; use v0-7
 ; GISEL12-NEXT: ;;#ASMEND
 ; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v12
-; GISEL12-NEXT: flat_store_b32 v[10:11], v12
+; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL12-NEXT: flat_store_b32 v[9:10], v11
+; GISEL12-NEXT: ; implicit-def: $vgpr9
 ; GISEL12-NEXT: .LBB6_2: ; %tail.block
 ; GISEL12-NEXT: s_wait_alu 0xfffe
 ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
@@ -1165,14 +1166,15 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
 ; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
 ; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
 ; DAGISEL12-NEXT: ; %bb.1: ; %shader
-; DAGISEL12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
-; DAGISEL12-NEXT: flat_load_b32 v12, v[10:11]
+; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
 ; DAGISEL12-NEXT: ;;#ASMSTART
 ; DAGISEL12-NEXT: ; use v0-7
 ; DAGISEL12-NEXT: ;;#ASMEND
 ; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
-; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v12
-; DAGISEL12-NEXT: flat_store_b32 v[10:11], v12
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
+; DAGISEL12-NEXT: ; implicit-def: $vgpr9
 ; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
 ; DAGISEL12-NEXT: s_wait_alu 0xfffe
 ; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
@@ -1191,15 +1193,16 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
 ; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
 ; GISEL10-NEXT: s_cbranch_execz .LBB6_2
 ; GISEL10-NEXT: ; %bb.1: ; %shader
-; GISEL10-NEXT: v_mov_b32_e32 v11, s5
-; GISEL10-NEXT: v_mov_b32_e32 v10, s4
-; GISEL10-NEXT: flat_load_dword v12, v[10:11]
+; GISEL10-NEXT: v_mov_b32_e32 v10, s5
+; GISEL10-NEXT: v_mov_b32_e32 v9, s4
+; GISEL10-NEXT: flat_load_dword v11, v[9:10]
 ; GISEL10-NEXT: ;;#ASMSTART
 ; GISEL10-NEXT: ; use v0-7
 ; GISEL10-NEXT: ;;#ASMEND
 ; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v12
-; GISEL10-NEXT: flat_store_dword v[10:11], v12
+; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL10-NEXT: flat_store_dword v[9:10], v11
+; GISEL10-NEXT: ; implicit-def: $vgpr9
 ; GISEL10-NEXT: .LBB6_2: ; %tail.block
 ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
 ; GISEL10-NEXT: s_mov_b32 exec_lo, s2
@@ -1214,21 +1217,23 @@ define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %e
 ; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
 ; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
 ; DAGISEL10-NEXT: ; %bb.1: ; %shader
-; DAGISEL10-NEXT: v_mov_b32_e32 v11, s5
-; DAGISEL10-NEXT: v_mov_b32_e32 v10, s4
-; DAGISEL10-NEXT: flat_load_dword v12, v[10:11]
+; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
+; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
+; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
 ; DAGISEL10-NEXT: ;;#ASMSTART
 ; DAGISEL10-NEXT: ; use v0-7
 ; DAGISEL10-NEXT: ;;#ASMEND
 ; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v12
-; DAGISEL10-NEXT: flat_store_dword v[10:11], v12
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
+; DAGISEL10-NEXT: ; implicit-def: $vgpr9
 ; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
 ; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
 ; DAGISEL10-NEXT: s_mov_b32 s0, s3
 ; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
 ; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
 entry:
+  %0 = call i32 @llvm.amdgcn.dead()
   %1 = call i1 @llvm.amdgcn.init.whole.wave()
   br i1 %1, label %shader, label %tail.block

@@ -1241,7 +1246,7 @@ shader: ; preds = %entry

 tail.block: ; preds = %.exit27, %.exit49, %244, %243, %entry
   %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
-  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ poison, %shader ]
+  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %0, %shader ]
   %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
   %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
   call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32} %vgprs, i32 0)
```
