Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4255,10 +4255,9 @@ same *vendor-name*.
wavefront for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly. This
register is written to. This
includes the special
SGPRs for VCC, Flat
Scratch (GFX7-GFX9)
Expand All @@ -4276,10 +4275,10 @@ same *vendor-name*.
each work-item for
GFX6-GFX9. A register
is required if it is
used explicitly, or
written to, or
if a higher numbered
register is used
explicitly.
register is
written to.
".agpr_count" integer Required Number of accumulator
registers required by
each work-item for
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
// Shaders that use the init.whole.wave intrinsic sometimes have VGPR
// arguments that are only added for the purpose of preserving their inactive
// lanes. Skip including them in the VGPR count.
if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,16 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
if (MI.isImplicitDef())
continue;

// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
bool IsAGPR = false;

if (!MO.isReg())
if (!MO.isReg() || MO.isUse())
continue;

Register Reg = MO.getReg();
Expand Down
76 changes: 76 additions & 0 deletions llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Use VGPRs above the input arguments.
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0x1d{{$}}

define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail

shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; clobber v28", "~{v28}"()
br label %tail

tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]

%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11

%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3

call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
50 changes: 50 additions & 0 deletions llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; CHECK-LABEL: leaf_shader:
; CHECK: .vgpr_count:{{.*}}0xc{{$}}

; Function without calls.
define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
i32 %active.vgpr1, i32 %active.vgpr2,
i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
local_unnamed_addr {
entry:
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %compute, label %merge

compute:
; Perform a more complex computation using active VGPRs
%square = mul i32 %active.vgpr1, %active.vgpr1
%product = mul i32 %square, %active.vgpr2
%sum = add i32 %product, %input.value
%result = add i32 %sum, 42
br label %merge

merge:
%final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]

store i32 %final.result, ptr %output.ptr, align 4

ret void
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; The shader is free to use any of the VGPRs mapped to a %inactive.vpgr as long as it only touches its active lanes.
; In that case, the VGPR should be included in the .vgpr_count
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0xd{{$}}

define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail

shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
br label %tail

tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]

%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11

%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3

call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
Loading
Loading