diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index eb7bde6999491..11037e80aa8b1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3450,4 +3450,11 @@ def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
   [llvm_anyptr_ty], [llvm_anyptr_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+/// Make it clear to the backend that this value is really dead. For instance,
+/// when used as an input to a phi node, it will make it possible for the
+/// backend to allocate the dead lanes for operations within the corresponding
+/// incoming block.
+def int_amdgcn_dead: DefaultAttrsIntrinsic<[llvm_any_ty], [],
+    [IntrNoMem, IntrWillReturn, IntrNoCallback]>;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3bbbbcf71d8ae..26d0cb85a984f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1190,6 +1190,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
   case Intrinsic::amdgcn_permlane16_swap:
   case Intrinsic::amdgcn_permlane32_swap:
     return selectPermlaneSwapIntrin(I, IntrinsicID);
+  case Intrinsic::amdgcn_dead: {
+    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+    I.removeOperand(1); // drop intrinsic ID
+    return RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+                                        AMDGPU::VGPR_32RegClass, *MRI);
+  }
   default:
     return selectImpl(I, *CoverageInfo);
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 2e5f42c3bdc40..2693ad3894cca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4676,6 +4676,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_set_inactive_chain_arg:
     case Intrinsic::amdgcn_permlane64:
     case Intrinsic::amdgcn_ds_bpermute_fi_b32:
+    case Intrinsic::amdgcn_dead:
      return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_cvt_pkrtz:
       if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 10175557fadc7..3b62dcf3c92cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -362,6 +362,8 @@ def : SourceOfDivergence;
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
 
+def : SourceOfDivergence<int_amdgcn_dead>;
+
 class AlwaysUniform<Intrinsic intr> {
   Intrinsic Intr = intr;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 6e08aff24ec23..5c32d69054e73 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4268,3 +4268,9 @@ def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
   let hasSideEffects = 1;
   let SubtargetPredicate = isGFX10Plus;
 }
+
+// FIXME: Would be nice if we could set the register class for the destination
+// register too.
+def IMP_DEF_FROM_INTRINSIC: Pat<
+  (i32 (int_amdgcn_dead)), (IMPLICIT_DEF)>;
+
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index aa5208560817f..bb840023daf5d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -520,7 +520,12 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
   ret void
 }
 
-
+; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32()
+define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
+  %v = call i32 @llvm.amdgcn.dead.i32()
+  store i32 %v, ptr addrspace(1) %out
+  ret void
+}
 
 declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
 declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
@@ -558,5 +563,7 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1))
 declare <4 x half> @llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1))
 declare <4 x bfloat> @llvm.amdgcn.global.load.tr.b128.v4bf16(ptr addrspace(1))
 
+declare i32 @llvm.amdgcn.dead.i32()
+
 attributes #0 = { nounwind convergent }
 attributes #1 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
new file mode 100644
index 0000000000000..a009854542f21
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-DAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=ASM-GISEL %s
+
+; Test that we can use v0 for temporaries in the if.then block.
+define i32 @dead(i1 %cond, i32 %x, ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2) #0 {
+; ASM-DAG-LABEL: dead:
+; ASM-DAG: ; %bb.0: ; %entry
+; ASM-DAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-DAG-NEXT: s_wait_expcnt 0x0
+; ASM-DAG-NEXT: s_wait_samplecnt 0x0
+; ASM-DAG-NEXT: s_wait_bvhcnt 0x0
+; ASM-DAG-NEXT: s_wait_kmcnt 0x0
+; ASM-DAG-NEXT: v_mov_b32_e32 v4, v0
+; ASM-DAG-NEXT: v_mov_b32_e32 v0, v1
+; ASM-DAG-NEXT: s_mov_b32 s0, exec_lo
+; ASM-DAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-DAG-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-DAG-NEXT: v_cmpx_eq_u32_e32 1, v1
+; ASM-DAG-NEXT: s_cbranch_execz .LBB0_2
+; ASM-DAG-NEXT: ; %bb.1: ; %if.then
+; ASM-DAG-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-DAG-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-DAG-NEXT: ; implicit-def: $vgpr0
+; ASM-DAG-NEXT: .LBB0_2: ; %if.end
+; ASM-DAG-NEXT: s_wait_alu 0xfffe
+; ASM-DAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-DAG-NEXT: s_setpc_b64 s[30:31]
+;
+; ASM-GISEL-LABEL: dead:
+; ASM-GISEL: ; %bb.0: ; %entry
+; ASM-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; ASM-GISEL-NEXT: s_wait_expcnt 0x0
+; ASM-GISEL-NEXT: s_wait_samplecnt 0x0
+; ASM-GISEL-NEXT: s_wait_bvhcnt 0x0
+; ASM-GISEL-NEXT: s_wait_kmcnt 0x0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v4, v0
+; ASM-GISEL-NEXT: v_mov_b32_e32 v0, v1
+; ASM-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; ASM-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; ASM-GISEL-NEXT: v_and_b32_e32 v1, 1, v4
+; ASM-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v1
+; ASM-GISEL-NEXT: s_cbranch_execz .LBB0_2
+; ASM-GISEL-NEXT: ; %bb.1: ; %if.then
+; ASM-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; ASM-GISEL-NEXT: global_store_b32 v[2:3], v0, off
+; ASM-GISEL-NEXT: ; implicit-def: $vgpr0
+; ASM-GISEL-NEXT: .LBB0_2: ; %if.end
+; ASM-GISEL-NEXT: s_wait_alu 0xfffe
+; ASM-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; ASM-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %dead = call i32 @llvm.amdgcn.dead.i32()
+  br i1 %cond, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %temp = add i32 %x, 1
+  store i32 %temp, ptr addrspace(1) %ptr1
+  br label %if.end
+
+if.end:
+  %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1bdaa4c98127d..110192ecefe55 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -1115,4 +1115,141 @@ tail:
   unreachable
 }
 
+; Since functions that contain amdgcn.init.whole.wave do not preserve the inactive
+; lanes of any VGPRs, the middle end will explicitly preserve them if needed by adding
+; dummy VGPR arguments. Since only the inactive lanes are important, we need to make
+; it clear to the backend that it's safe to allocate v9's active lanes inside the
+; %shader block. This is achieved by using the llvm.amdgcn.dead intrinsic.
+define amdgpu_cs_chain void @with_inactive_vgprs(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, i32 %active.vgpr, i32 %inactive.vgpr) {
+; GISEL12-LABEL: with_inactive_vgprs:
+; GISEL12: ; %bb.0: ; %entry
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: s_wait_expcnt 0x0
+; GISEL12-NEXT: s_wait_samplecnt 0x0
+; GISEL12-NEXT: s_wait_bvhcnt 0x0
+; GISEL12-NEXT: s_wait_kmcnt 0x0
+; GISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL12-NEXT: s_mov_b32 s4, s0
+; GISEL12-NEXT: s_mov_b32 s5, s1
+; GISEL12-NEXT: s_mov_b32 s0, s3
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL12-NEXT: s_cbranch_execz .LBB6_2
+; GISEL12-NEXT: ; %bb.1: ; %shader
+; GISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; GISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; GISEL12-NEXT: ;;#ASMSTART
+; GISEL12-NEXT: ; use v0-7
+; GISEL12-NEXT: ;;#ASMEND
+; GISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL12-NEXT: flat_store_b32 v[9:10], v11
+; GISEL12-NEXT: ; implicit-def: $vgpr9
+; GISEL12-NEXT: .LBB6_2: ; %tail.block
+; GISEL12-NEXT: s_wait_alu 0xfffe
+; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL12-NEXT: s_mov_b32 exec_lo, s2
+; GISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL12-LABEL: with_inactive_vgprs:
+; DAGISEL12: ; %bb.0: ; %entry
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: s_wait_expcnt 0x0
+; DAGISEL12-NEXT: s_wait_samplecnt 0x0
+; DAGISEL12-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL12-NEXT: s_wait_kmcnt 0x0
+; DAGISEL12-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL12-NEXT: s_mov_b32 s5, s1
+; DAGISEL12-NEXT: s_mov_b32 s4, s0
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL12-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL12-NEXT: ; %bb.1: ; %shader
+; DAGISEL12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4
+; DAGISEL12-NEXT: flat_load_b32 v11, v[9:10]
+; DAGISEL12-NEXT: ;;#ASMSTART
+; DAGISEL12-NEXT: ; use v0-7
+; DAGISEL12-NEXT: ;;#ASMEND
+; DAGISEL12-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL12-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL12-NEXT: flat_store_b32 v[9:10], v11
+; DAGISEL12-NEXT: ; implicit-def: $vgpr9
+; DAGISEL12-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL12-NEXT: s_mov_b32 s0, s3
+; DAGISEL12-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL12-NEXT: s_wait_alu 0xfffe
+; DAGISEL12-NEXT: s_setpc_b64 s[4:5]
+;
+; GISEL10-LABEL: with_inactive_vgprs:
+; GISEL10: ; %bb.0: ; %entry
+; GISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; GISEL10-NEXT: s_mov_b32 s4, s0
+; GISEL10-NEXT: s_mov_b32 s5, s1
+; GISEL10-NEXT: s_mov_b32 s0, s3
+; GISEL10-NEXT: s_and_saveexec_b32 s1, s6
+; GISEL10-NEXT: s_cbranch_execz .LBB6_2
+; GISEL10-NEXT: ; %bb.1: ; %shader
+; GISEL10-NEXT: v_mov_b32_e32 v10, s5
+; GISEL10-NEXT: v_mov_b32_e32 v9, s4
+; GISEL10-NEXT: flat_load_dword v11, v[9:10]
+; GISEL10-NEXT: ;;#ASMSTART
+; GISEL10-NEXT: ; use v0-7
+; GISEL10-NEXT: ;;#ASMEND
+; GISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; GISEL10-NEXT: flat_store_dword v[9:10], v11
+; GISEL10-NEXT: ; implicit-def: $vgpr9
+; GISEL10-NEXT: .LBB6_2: ; %tail.block
+; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL10-NEXT: s_mov_b32 exec_lo, s2
+; GISEL10-NEXT: s_setpc_b64 s[4:5]
+;
+; DAGISEL10-LABEL: with_inactive_vgprs:
+; DAGISEL10: ; %bb.0: ; %entry
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: s_or_saveexec_b32 s6, -1
+; DAGISEL10-NEXT: s_mov_b32 s5, s1
+; DAGISEL10-NEXT: s_mov_b32 s4, s0
+; DAGISEL10-NEXT: s_and_saveexec_b32 s0, s6
+; DAGISEL10-NEXT: s_cbranch_execz .LBB6_2
+; DAGISEL10-NEXT: ; %bb.1: ; %shader
+; DAGISEL10-NEXT: v_mov_b32_e32 v10, s5
+; DAGISEL10-NEXT: v_mov_b32_e32 v9, s4
+; DAGISEL10-NEXT: flat_load_dword v11, v[9:10]
+; DAGISEL10-NEXT: ;;#ASMSTART
+; DAGISEL10-NEXT: ; use v0-7
+; DAGISEL10-NEXT: ;;#ASMEND
+; DAGISEL10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL10-NEXT: v_add_nc_u32_e32 v8, v8, v11
+; DAGISEL10-NEXT: flat_store_dword v[9:10], v11
+; DAGISEL10-NEXT: ; implicit-def: $vgpr9
+; DAGISEL10-NEXT: .LBB6_2: ; %tail.block
+; DAGISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; DAGISEL10-NEXT: s_mov_b32 s0, s3
+; DAGISEL10-NEXT: s_mov_b32 exec_lo, s2
+; DAGISEL10-NEXT: s_setpc_b64 s[4:5]
+entry:
+  %imp.def = call i32 @llvm.amdgcn.dead()
+  %initial.exec = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %initial.exec, label %shader, label %tail.block
+
+shader: ; preds = %entry
+  %use.another.vgpr = load i32, ptr %callee ; something that won't be moved past the inline asm
+  call void asm sideeffect "; use v0-7", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"()
+  store i32 %use.another.vgpr, ptr %callee
+  %active.vgpr.new = add i32 %active.vgpr, %use.another.vgpr
+  br label %tail.block
+
+tail.block: ; preds = %entry, %shader
+  %active.vgpr.arg = phi i32 [ %active.vgpr, %entry ], [ %active.vgpr.new, %shader ]
+  %inactive.vgpr.arg = phi i32 [ %inactive.vgpr, %entry ], [ %imp.def, %shader ]
+  %vgprs.0 = insertvalue { i32, i32 } poison, i32 %active.vgpr.arg, 0
+  %vgprs = insertvalue { i32, i32 } %vgprs.0, i32 %inactive.vgpr.arg, 1
+  call void (ptr, i32, i32, { i32, i32 }, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.sl_i32i32(ptr inreg %callee, i32 inreg %exec, i32 inreg %sgpr, { i32, i32 } %vgprs, i32 0)
+  unreachable
+}
+
 declare amdgpu_gfx <16 x i32> @write_v0_v15(<16 x i32>)
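
For reference, the intended usage pattern in isolation, distilled from the @dead test above with the autogenerated check lines omitted. This is an illustrative sketch, not part of the patch; the function name @sketch and its arguments are made up. The call to llvm.amdgcn.dead.i32 feeds the phi operand for the path on which the incoming value is never read, so the backend is free to reuse that VGPR for temporaries inside %if.then.

; Minimal usage sketch (illustrative only, not part of the patch).
define i32 @sketch(i1 %cond, i32 %x, ptr addrspace(1) %p) {
entry:
  %dead = call i32 @llvm.amdgcn.dead.i32()
  br i1 %cond, label %if.then, label %if.end

if.then:
  ; %x is not live out along this path, so its register can be reused here.
  %tmp = add i32 %x, 1
  store i32 %tmp, ptr addrspace(1) %p
  br label %if.end

if.end:
  ; On the %if.then path the phi consumes the dead value instead of %x.
  %res = phi i32 [ %x, %entry ], [ %dead, %if.then ]
  ret i32 %res
}

declare i32 @llvm.amdgcn.dead.i32()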