-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU: Track minNumAGPRs in MFI instead of mayUseAGPRs #161996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Previously we were getting lucky on cases that can use AV registers with the normal optimization pipeline. I do not understand what the check against getAddressableNumArchVGPRs was doing here. Full diff: https://github.com/llvm/llvm-project/pull/161996.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 908d856d386f5..ec4e4f9ea9a3a 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -85,9 +85,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (ST.hasGFX90AInsts()) {
// FIXME: MayNeedAGPRs is a misnomer for how this is used. MFMA selection
// should be separated from availability of AGPRs
- if (MFMAVGPRForm ||
- (ST.getMaxNumVGPRs(F) <= ST.getAddressableNumArchVGPRs() &&
- !mayUseAGPRs(F)))
+ if (!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
}
diff --git a/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll
new file mode 100644
index 0000000000000..ba0fdc689b4ff
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/smfmac_alloc_failure_no_agpr_O0.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=1 < %s | FileCheck %s
+
+declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half>, <16 x float>, i32, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
+; CHECK-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x34
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_load_dword s2, s[2:3], 0x64
+; CHECK-NEXT: s_mov_b32 s3, 0x3ff
+; CHECK-NEXT: v_and_b32_e64 v1, v1, s3
+; CHECK-NEXT: s_mov_b32 s3, 6
+; CHECK-NEXT: v_lshlrev_b32_e64 v8, s3, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v7
+; CHECK-NEXT: v_mov_b32_e32 v2, v6
+; CHECK-NEXT: v_mov_b32_e32 v3, v5
+; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v5, v13
+; CHECK-NEXT: v_mov_b32_e32 v6, v12
+; CHECK-NEXT: v_mov_b32_e32 v7, v11
+; CHECK-NEXT: v_mov_b32_e32 v24, v10
+; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v25, v13
+; CHECK-NEXT: v_mov_b32_e32 v26, v12
+; CHECK-NEXT: v_mov_b32_e32 v27, v11
+; CHECK-NEXT: v_mov_b32_e32 v28, v10
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v8, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v29, v11
+; CHECK-NEXT: v_mov_b32_e32 v30, v10
+; CHECK-NEXT: v_mov_b32_e32 v31, v9
+; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v9, v31
+; CHECK-NEXT: v_mov_b32_e32 v10, v30
+; CHECK-NEXT: v_mov_b32_e32 v11, v29
+; CHECK-NEXT: v_mov_b32_e32 v12, v28
+; CHECK-NEXT: v_mov_b32_e32 v13, v27
+; CHECK-NEXT: v_mov_b32_e32 v14, v26
+; CHECK-NEXT: v_mov_b32_e32 v15, v25
+; CHECK-NEXT: v_mov_b32_e32 v16, v24
+; CHECK-NEXT: v_mov_b32_e32 v17, v7
+; CHECK-NEXT: v_mov_b32_e32 v18, v6
+; CHECK-NEXT: v_mov_b32_e32 v19, v5
+; CHECK-NEXT: v_mov_b32_e32 v20, v4
+; CHECK-NEXT: v_mov_b32_e32 v21, v3
+; CHECK-NEXT: v_mov_b32_e32 v22, v2
+; CHECK-NEXT: v_mov_b32_e32 v23, v1
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
+; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
+; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
+; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[8:9]
+; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[6:7]
+; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[4:5]
+; CHECK-NEXT: v_mov_b32_e32 v1, s2
+; CHECK-NEXT: s_nop 1
+; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 v[8:23], v[2:5], v[24:31], v1 cbsz:1 abid:2
+; CHECK-NEXT: s_nop 11
+; CHECK-NEXT: v_mov_b32_e32 v1, v23
+; CHECK-NEXT: v_mov_b32_e32 v6, v22
+; CHECK-NEXT: v_mov_b32_e32 v7, v21
+; CHECK-NEXT: v_mov_b32_e32 v2, v20
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v3, v7
+; CHECK-NEXT: v_mov_b32_e32 v4, v6
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
+; CHECK-NEXT: v_mov_b32_e32 v1, v19
+; CHECK-NEXT: v_mov_b32_e32 v6, v18
+; CHECK-NEXT: v_mov_b32_e32 v7, v17
+; CHECK-NEXT: v_mov_b32_e32 v2, v16
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v3, v7
+; CHECK-NEXT: v_mov_b32_e32 v4, v6
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32
+; CHECK-NEXT: v_mov_b32_e32 v1, v15
+; CHECK-NEXT: v_mov_b32_e32 v6, v14
+; CHECK-NEXT: v_mov_b32_e32 v7, v13
+; CHECK-NEXT: v_mov_b32_e32 v2, v12
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v3, v7
+; CHECK-NEXT: v_mov_b32_e32 v4, v6
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16
+; CHECK-NEXT: v_mov_b32_e32 v1, v11
+; CHECK-NEXT: v_mov_b32_e32 v6, v10
+; CHECK-NEXT: v_mov_b32_e32 v7, v9
+; CHECK-NEXT: v_mov_b32_e32 v2, v8
+; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT: v_mov_b32_e32 v3, v7
+; CHECK-NEXT: v_mov_b32_e32 v4, v6
+; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; CHECK-NEXT: s_endpgm
+bb:
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
+ %in.1 = load <16 x float>, ptr addrspace(1) %gep
+ %mai.1 = tail call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %a, <16 x half> %b, <16 x float> %in.1, i32 %idx, i32 1, i32 2)
+ store <16 x float> %mai.1, ptr addrspace(1) %arg
+ ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" }
|
Previously we were getting lucky on cases that can use AV registers with the normal optimization pipeline. I do not understand what the check against getAddressableNumArchVGPRs was doing here. This logic needs to be consistent with getMaxNumVectorRegs, as that is what getReservedRegs uses to determine the AGPR budget. In the future we should directly check the minimum AGPR budget, and individual selection patterns need to know the minimum budget required for them.
Start accounting for the number of AGPRs required to perform the allocation. Refine the selection predicates to check this number is available, and default to selecting the VGPR case if there aren't enough. This avoids register allocation failures for the largest MFMAs with the default register budget.
04bb381
to
764d91d
Compare
Fix mfma agpr allocation failures with -O0. Previously we were getting lucky
on cases that can use AV registers with the normal optimization pipeline.
This logic needs to be consistent with getMaxNumVectorRegs,
as that is what getReservedRegs uses to determine the AGPR budget. In the
future we should directly check the minimum AGPR budget, and individual
selection patterns need to know the minimum budget required for them.
Start accounting for the number of AGPRs required to perform the
allocation. Refine the selection predicates to check this number is
available, and default to selecting the VGPR case if there aren't
enough. This also avoids register allocation failures for the largest
MFMAs with the default register budget.