7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3450,4 +3450,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
[llvm_anyptr_ty], [llvm_anyptr_ty],
[IntrNoMem, IntrSpeculatable]
>;

/// Make it clear to the backend that this value is really dead. For instance,
Contributor:
So is this just convergent poison? Why isn't it IntrConvergent?

Collaborator (Author):
Because in my current usage it's OK for it to be sunk from the entry block into the shader block. In my very limited understanding, convergent would preclude that (but do correct me if I'm wrong).

Contributor:
But is hoisting correct? In the future, with convergence tokens, it will be explicit where movement can occur.

Collaborator (Author):
I think hoisting should be correct too.

Collaborator:
Yes, hoisting is semantically correct, just undesired in practice.

/// when used as an input to a phi node, it will make it possible for the
/// backend to allocate the dead lanes for operations within the corresponding
/// incoming block.
def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_any_ty], [],
[IntrNoMem, IntrWillReturn, IntrNoCallback]>;
}
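For reference, a minimal IR sketch of the phi usage the comment above describes (block and value names are illustrative; the CodeGen test added later in this change exercises the same pattern):

entry:
  %dead = call i32 @llvm.amdgcn.dead.i32()      ; value is dead, so its register is free for reuse
  br i1 %cond, label %if.then, label %if.end

if.then:
  ; temporaries in this block may be allocated into the register holding %dead
  br label %if.end

if.end:
  ; on the %if.then path the phi result is the dead value, so nothing needs preserving
  %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]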
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_permlane16_swap:
case Intrinsic::amdgcn_permlane32_swap:
return selectPermlaneSwapIntrin(I, IntrinsicID);
case Intrinsic::amdgcn_dead: {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
I.removeOperand(1); // drop intrinsic ID
return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AMDGPU::VGPR_32RegClass, *MRI);
}
default:
return selectImpl(I, *CoverageInfo);
}
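Roughly, the new amdgcn_dead case rewrites the generic intrinsic into a plain implicit def constrained to a 32-bit VGPR. A sketch of the MIR before and after selection (virtual register numbering is illustrative):

  ; before selection: %0:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.dead.i32)
  ; after selection:  %0:vgpr_32 = IMPLICIT_DEF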
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_ds_bpermute_fi_b32:
case Intrinsic::amdgcn_dead:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_cvt_pkrtz:
if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -362,6 +362,8 @@ def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;

def : SourceOfDivergence<int_amdgcn_dead>;
Contributor:
Needs test in test/Analysis/UniformityAnalysis


class AlwaysUniform<Intrinsic intr> {
Intrinsic Intr = intr;
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4268,3 +4268,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
let hasSideEffects = 1;
let SubtargetPredicate = isGFX10Plus;
}

// FIXME: Would be nice if we could set the register class for the destination
// register too.
def IMP_DEF_FROM_INTRINSIC: Pat<
(i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;

9 changes: 8 additions & 1 deletion llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -520,7 +520,12 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
ret void
}


; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32()
define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
%v = call i32 @llvm.amdgcn.dead.i32()
store i32 %v, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -558,5 +563,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))

declare i32 @llvm.amdgcn.dead.i32()

attributes #0 = { nounwind convergent }
attributes #1 = { nounwind readnone convergent }
64 changes: 64 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s

; Test that we can use v0 for temporaries in the if.then block.
define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
; ASM-DAG-LABEL: dead:
; ASM-DAG: ; %bb.0: ; %entry
; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0
; ASM-DAG-NEXT: s_wait_expcnt 0x0
; ASM-DAG-NEXT: s_wait_samplecnt 0x0
; ASM-DAG-NEXT: s_wait_bvhcnt 0x0
; ASM-DAG-NEXT: s_wait_kmcnt 0x0
; ASM-DAG-NEXT: v_mov_b32_e32 v4, v0
; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1
; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo
; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v4
; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1
; ASM-DAG-NEXT: s_cbranch_execz .LBB0_2
; ASM-DAG-NEXT: ; %bb.1: ; %if.then
; ASM-DAG-NEXT: v_add_nc_u32_e32 v0, 1, v0
; ASM-DAG-NEXT: global_store_b32 v[2:3], v0, off
; ASM-DAG-NEXT: ; implicit-def: $vgpr0
; ASM-DAG-NEXT: .LBB0_2: ; %if.end
; ASM-DAG-NEXT: s_wait_alu 0xfffe
; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-DAG-NEXT: s_setpc_b64 s[30:31]
;
; ASM-GISEL-LABEL: dead:
; ASM-GISEL: ; %bb.0: ; %entry
; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; ASM-GISEL-NEXT: s_wait_expcnt 0x0
; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
; ASM-GISEL-NEXT: v_mov_b32_e32 v4, v0
; ASM-GISEL-NEXT: v_mov_b32_e32 v0, v1
; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v4
; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1
; ASM-GISEL-NEXT: s_cbranch_execz .LBB0_2
; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
; ASM-GISEL-NEXT: global_store_b32 v[2:3], v0, off
; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
; ASM-GISEL-NEXT: .LBB0_2: ; %if.end
; ASM-GISEL-NEXT: s_wait_alu 0xfffe
; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%dead = call i32 @llvm.amdgcn.dead.i32()
br i1 %cond, label %if.then, label %if.end

if.then: ; preds = %entry
%temp = add i32 %x, 1
store i32 %temp, ptr addrspace(1) %ptr1
br label %if.end

if.end:
%res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
ret i32 %res
}
137 changes: 137 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -1115,4 +1115,141 @@ tail:
unreachable
}

; Functions that contain amdgcn.init.whole.wave do not preserve the inactive lanes
; of any VGPRs, so the middle end will explicitly preserve them when needed by adding
; dummy VGPR arguments. Since only the inactive lanes matter, we need to make it clear
; to the backend that it's safe to allocate v9's active lanes inside the shader block.
; This is achieved by using the llvm.amdgcn.dead intrinsic.
define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
; GISEL12-LABEL: with_inactive_vgprs:
; GISEL12: ; %bb.0: ; %entry
; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT: s_wait_expcnt 0x0
; GISEL12-NEXT: s_wait_samplecnt 0x0
; GISEL12-NEXT: s_wait_bvhcnt 0x0
; GISEL12-NEXT: s_wait_kmcnt 0x0
; GISEL12-NEXT: s_or_saveexec_b32 s6, -1
; GISEL12-NEXT: s_mov_b32 s4, s0
; GISEL12-NEXT: s_mov_b32 s5, s1
; GISEL12-NEXT: s_mov_b32 s0, s3
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
; GISEL12-NEXT: s_cbranch_execz .LBB6_2
; GISEL12-NEXT: ; %bb.1: ; %shader
; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
; GISEL12-NEXT: ;;#ASMSTART
; GISEL12-NEXT: ; use v0-7
; GISEL12-NEXT: ;;#ASMEND
; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
; GISEL12-NEXT: flat_store_b32 v[9:10], v11
; GISEL12-NEXT: ; implicit-def: $vgpr9
; GISEL12-NEXT: .LBB6_2: ; %tail.block
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GISEL12-NEXT: s_mov_b32 exec_lo, s2
; GISEL12-NEXT: s_setpc_b64 s[4:5]
;
; DAGISEL12-LABEL: with_inactive_vgprs:
; DAGISEL12: ; %bb.0: ; %entry
; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT: s_wait_expcnt 0x0
; DAGISEL12-NEXT: s_wait_samplecnt 0x0
; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
; DAGISEL12-NEXT: s_wait_kmcnt 0x0
; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
; DAGISEL12-NEXT: s_mov_b32 s5, s1
; DAGISEL12-NEXT: s_mov_b32 s4, s0
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
; DAGISEL12-NEXT: ; %bb.1: ; %shader
; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
; DAGISEL12-NEXT: ;;#ASMSTART
; DAGISEL12-NEXT: ; use v0-7
; DAGISEL12-NEXT: ;;#ASMEND
; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
; DAGISEL12-NEXT: ; implicit-def: $vgpr9
; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; DAGISEL12-NEXT: s_mov_b32 s0, s3
; DAGISEL12-NEXT: s_mov_b32 exec_lo, s2
; DAGISEL12-NEXT: s_wait_alu 0xfffe
; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
;
; GISEL10-LABEL: with_inactive_vgprs:
; GISEL10: ; %bb.0: ; %entry
; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL10-NEXT: s_or_saveexec_b32 s6, -1
; GISEL10-NEXT: s_mov_b32 s4, s0
; GISEL10-NEXT: s_mov_b32 s5, s1
; GISEL10-NEXT: s_mov_b32 s0, s3
; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
; GISEL10-NEXT: s_cbranch_execz .LBB6_2
; GISEL10-NEXT: ; %bb.1: ; %shader
; GISEL10-NEXT: v_mov_b32_e32 v10, s5
; GISEL10-NEXT: v_mov_b32_e32 v9, s4
; GISEL10-NEXT: flat_load_dword v11, v[9:10]
; GISEL10-NEXT: ;;#ASMSTART
; GISEL10-NEXT: ; use v0-7
; GISEL10-NEXT: ;;#ASMEND
; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
; GISEL10-NEXT: flat_store_dword v[9:10], v11
; GISEL10-NEXT: ; implicit-def: $vgpr9
; GISEL10-NEXT: .LBB6_2: ; %tail.block
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GISEL10-NEXT: s_mov_b32 exec_lo, s2
; GISEL10-NEXT: s_setpc_b64 s[4:5]
;
; DAGISEL10-LABEL: with_inactive_vgprs:
; DAGISEL10: ; %bb.0: ; %entry
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: s_or_saveexec_b32 s6, -1
; DAGISEL10-NEXT: s_mov_b32 s5, s1
; DAGISEL10-NEXT: s_mov_b32 s4, s0
; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
; DAGISEL10-NEXT: ; %bb.1: ; %shader
; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
; DAGISEL10-NEXT: ;;#ASMSTART
; DAGISEL10-NEXT: ; use v0-7
; DAGISEL10-NEXT: ;;#ASMEND
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
; DAGISEL10-NEXT: ; implicit-def: $vgpr9
; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
; DAGISEL10-NEXT: s_mov_b32 s0, s3
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
entry:
%imp.def = call i32 @llvm.amdgcn.dead()
%initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %initial.exec, label %shader, label %tail.block

shader: ; preds = %entry
%use.another.vgpr = load i32, ptr %callee ; something that won't be moved past the inline asm
call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
store i32 %use.another.vgpr, ptr %callee
%active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
br label %tail.block

tail.block: ; preds = %entry, %shader
%active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
%inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
%vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
%vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32} %vgprs, i32 0)
unreachable
}

declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)