Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -990,7 +990,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// dispatch registers are function args.
unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

if (isShader(F.getCallingConv())) {
// Shaders that use the init.whole.wave intrinsic sometimes have VGPR
// arguments that are only added for the purpose of preserving their inactive
// lanes. Skip including them in the VGPR count.
if (isShader(F.getCallingConv()) && !MFI->hasInitWholeWave()) {
bool IsPixelShader =
F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
int32_t MaxSGPR = -1;
Info.CalleeSegmentSize = 0;

bool IsIWWFunction = MFI->hasInitWholeWave();

for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// At this point, the chain call pseudos are already expanded.
bool IsChainCall = MI.getOpcode() == AMDGPU::SI_TCRETURN;
bool IsImplicitDef = MI.isImplicitDef();

// TODO: Check regmasks? Do they occur anywhere except calls?
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
Expand Down Expand Up @@ -239,6 +245,16 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
break;
}

// For functions that use the llvm.amdgcn.init.whole.wave intrinsic, we
// often add artificial VGPR arguments for the purpose of preserving
// their inactive lanes. These should not be reported as part of our
// VGPR usage. We can identify them easily because they're only used in
// the chain call, and possibly in an IMPLICIT_DEF coming from an
// llvm.amdgcn.dead intrinsic.
if (IsIWWFunction && (IsChainCall || IsImplicitDef) &&
TRI.isVectorRegister(MRI, Reg))
continue;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't matter if it's a vector register. I thought we already had a helper somewhere to skip synthetic uses


if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
Expand Down
76 changes: 76 additions & 0 deletions llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Use VGPRs above the input arguments.
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0x1d{{$}}

define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail

shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; clobber v28", "~{v28}"()
br label %tail

tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]

%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11

%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3

call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
50 changes: 50 additions & 0 deletions llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; CHECK-LABEL: leaf_shader:
; CHECK: .vgpr_count:{{.*}}0xc{{$}}

; Function without calls.
define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
i32 %active.vgpr1, i32 %active.vgpr2,
i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
local_unnamed_addr {
entry:
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %compute, label %merge

compute:
; Perform a more complex computation using active VGPRs
%square = mul i32 %active.vgpr1, %active.vgpr1
%product = mul i32 %square, %active.vgpr2
%sum = add i32 %product, %input.value
%result = add i32 %sum, 42
br label %merge

merge:
%final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]

store i32 %final.result, ptr %output.ptr, align 4

ret void
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s

; CHECK-LABEL: .shader_functions:

; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
; The shader is free to use any of the VGPRs mapped to a %inactive.vpgr as long as it only touches its active lanes.
; In that case, the VGPR should be included in the .vgpr_count
; CHECK-LABEL: _miss_1:
; CHECK: .vgpr_count:{{.*}}0xd{{$}}

define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
i32 %vcr, { i32 } %system.data,
i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
i32 %inactive.vgpr8, i32 %inactive.vgpr9)
local_unnamed_addr {
entry:
%system.data.value = extractvalue { i32 } %system.data, 0
%dead.val = call i32 @llvm.amdgcn.dead.i32()
%is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
br i1 %is.whole.wave, label %shader, label %tail

shader:
%system.data.extract = extractvalue { i32 } %system.data, 0
%data.mul = mul i32 %system.data.extract, 2
%data.add = add i32 %data.mul, 1
call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
br label %tail

tail:
%final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
%final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
%final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
%final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
%final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
%final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
%final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
%final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
%final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
%final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
%final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
%final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]

%struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
%struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
%struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
%struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
%struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
%struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
%struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
%struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
%struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
%struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
%struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
%final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11

%vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
%vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
%vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
%final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3

call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
@llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
unreachable
}

declare i32 @llvm.amdgcn.dead.i32()
declare i1 @llvm.amdgcn.init.whole.wave()
declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)

declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

!amdgpu.pal.metadata.msgpack = !{!0}

!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
Loading