-
Notifications
You must be signed in to change notification settings - Fork 14.9k
AMDGPU: Define agpr versions of ds permute instructions #156695
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Correctly model these without AV_* operands. This is another step towards removing the special casing in TargetInstrInfo::getRegClass. Also add some tests for this. Patch is 21.12 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/156695.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 23dd660c3e57e..bec920380e081 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -520,6 +520,19 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
let has_gds = 0;
}
+// Defines a DS_1A1D_PERMUTE instruction under the base name using data_op,
+// plus an "_agpr" variant that uses the equivalent AGPR operand. The
+// selection pattern node is passed only to the base definition; the _agpr
+// variant is defined with null_frag.
+multiclass DS_1A1D_PERMUTE_mc <string opName, SDPatternOperator node = null_frag,
+ RegisterOperand data_op = VGPROp_32> {
+ // data_op must be a VGPR operand; the AGPR form is derived from it below.
+ assert OperandIsVGPR<data_op>.ret,
+ "DS with 2 data operands should be declared with VGPRs";
+ def "" : DS_1A1D_PERMUTE<opName, node, data_op>;
+
+ // The AGPR variant is only defined under the isGFX90APlus predicate.
+ let SubtargetPredicate = isGFX90APlus in {
+ def _agpr : DS_1A1D_PERMUTE<opName, null_frag,
+ getEquivalentAGPROperand<data_op>.ret>;
+ }
+}
+
+
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
@@ -837,10 +850,10 @@ def DS_NOP : DS_VOID<"ds_nop">;
let SubtargetPredicate = isGFX8Plus in {
let Uses = [EXEC] in {
-def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
- int_amdgcn_ds_permute>;
-def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
- int_amdgcn_ds_bpermute>;
+// Instantiated through DS_1A1D_PERMUTE_mc so that each opcode gets an _agpr
+// variant in addition to the base definition.
+defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_permute_b32",
+ int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32",
+ int_amdgcn_ds_bpermute>;
}
} // let SubtargetPredicate = isGFX8Plus
diff --git a/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll
new file mode 100644
index 0000000000000..5cd798d4f6db1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll
@@ -0,0 +1,334 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+; Try to stress ds.bpermute and ds.permute instructions with AGPR/AV
+; inputs. It's not permissible to mix AGPR and VGPR data operands.
+
+; Both bpermute operands come from AGPR-constrained asm defs ("=a") and the
+; result feeds an AGPR-constrained asm use ("a"). The checks expect both
+; inputs to be copied into VGPRs before the ds_bpermute_b32 and the result to
+; be copied back into an AGPR afterwards.
+define void @ds_bpermute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_a__use_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=a"()
+ %op1 = call i32 asm "; def $0", "=a"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+ ret void
+}
+
+; Mixed inputs: the first bpermute operand is defined in a VGPR ("=v"), the
+; second in an AGPR ("=a"); the result is used in an AGPR ("a"). The checks
+; expect only the AGPR input to be copied to a VGPR, and the result to be
+; written back to an AGPR.
+define void @ds_bpermute_b32_v_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_v_a__use_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=v"()
+ %op1 = call i32 asm "; def $0", "=a"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+ ret void
+}
+
+; Mixed inputs: the first bpermute operand is defined in an AGPR ("=a"), the
+; second in a VGPR ("=v"); the result is used in an AGPR ("a"). The checks
+; expect only the AGPR input to be copied to a VGPR, and the result to be
+; written back to an AGPR.
+define void @ds_bpermute_b32_a_v__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_v__use_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT: ds_bpermute_b32 v0, v1, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=a"()
+ %op1 = call i32 asm "; def $0", "=v"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+ ret void
+}
+
+; Both bpermute operands are defined in AGPRs ("=a") but the result is used
+; in a VGPR ("v"). The checks expect both inputs to be copied to VGPRs and no
+; v_accvgpr_write_b32 of the result.
+define void @ds_bpermute_b32_a_a__use_v(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_a__use_v:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=a"()
+ %op1 = call i32 asm "; def $0", "=a"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "v"(i32 %bpermute)
+ ret void
+}
+
+; Both bpermute operands are defined in VGPRs ("=v") and the result is used
+; in an AGPR ("a"). The checks expect no input copies, only a
+; v_accvgpr_write_b32 of the result.
+define void @ds_bpermute_b32_v_v__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_v_v__use_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=v"()
+ %op1 = call i32 asm "; def $0", "=v"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+ ret void
+}
+
+; Inputs and the result all use the "^VA" constraint. The checks expect VGPRs
+; to be chosen for every value, with no v_accvgpr copies around the
+; ds_bpermute_b32.
+define void @ds_bpermute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_av_av__use_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=^VA"()
+ %op1 = call i32 asm "; def $0", "=^VA"()
+ %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "^VA"(i32 %bpermute)
+ ret void
+}
+
+; "^VA" inputs with the bpermute issued between an asm def and an asm use
+; that pin v[0:31] and v[32:63]. The checks expect the operands to be defined
+; in a0/a1 and read into v0/v1 right before the ds_bpermute_b32, with
+; v40-v47 and v56-v63 copied to a2-a17 on entry and copied back before the
+; return.
+; NOTE(review): %gep.0 and %gep.1 are never used in this function.
+define i32 @ds_bpermute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_av_av_no_vgprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
+ %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %permute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ ret i32 %permute
+}
+
+; ds.permute counterpart of the all-AGPR case: both operands are defined in
+; AGPRs ("=a") and the result is used in an AGPR ("a"). The checks expect
+; both inputs to be copied into VGPRs before the ds_permute_b32 and the
+; result to be copied back into an AGPR afterwards.
+define void @ds_permute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_a_a__use_a:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_permute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=a"()
+ %op1 = call i32 asm "; def $0", "=a"()
+ %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "a"(i32 %permute)
+ ret void
+}
+
+; ds.permute with "^VA" constraints on both inputs and on the result. The
+; checks expect VGPRs to be chosen for every value, with no v_accvgpr copies
+; around the ds_permute_b32.
+define void @ds_permute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_av_av__use_av:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ds_permute_b32 v0, v0, v1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=^VA"()
+ %op1 = call i32 asm "; def $0", "=^VA"()
+ %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "^VA"(i32 %permute)
+ ret void
+}
+
+; ds.permute with "^VA" inputs, issued between an asm def and an asm use that
+; pin v[0:31] and v[32:63]. The checks expect the operands to be defined in
+; a0/a1 and read into v0/v1 right before the ds_permute_b32, with v40-v47 and
+; v56-v63 copied to a2-a17 on entry and copied back before the return.
+; NOTE(review): %gep.0 and %gep.1 are never used in this function.
+define i32 @ds_permute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_av_av_no_vgprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def a1
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:31]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT: ds_permute_b32 v0, v0, v1
+; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
+ %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
+ %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
+ %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+ %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+ %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+ %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+ call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+ ret i32 %permute
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index e8653c4681c1f..d0dc0c76fa0f3 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -8282,59 +8282,59 @@ ds_swizzle_b32 a5, v1
ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00")
// GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2 offset:65535
// GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a255, v1, a2 offset:65535
// GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v255, a2 offset:65535
// GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a255 offset:65535
// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2
// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2
// GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2 offset:4
// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a2 offset:65535
// GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a255, v1, a2 offset:65535
// GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v255, a2 offset:65535
// GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a255 of...
[truncated]
|
@@ -0,0 +1,334 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Any chance it will actually use AGPR in codegen?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No. This needs new support in AMDGPURewriteAGPRCopyMFMA (which will also need a renaming...)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, but it would be nice to have a codegen test with direct use of AGPR.
75588de
to
25ff1b6
Compare
90b88ef
to
61b4eca
Compare
These are 2-data operations that need to use all-AGPR or all-VGPR inputs. Stop defining them with AVLdSt data operands, and add _agpr variants.
Correctly model these without AV_* operands. This is another step towards removing the special casing in TargetInstrInfo::getRegClass. Also add some tests for this.
61b4eca
to
8fccb5d
Compare
25ff1b6
to
22f1911
Compare
Correctly model these without AV_* operands. This is another
step towards removing the special casing in
TargetInstrInfo::getRegClass. Also add some tests for this.