AMDGPU: Define agpr versions of ds permute instructions #156695

arsenm · 2025-09-03T15:30:36Z

Correctly model these without AV_* operands. This is another
step towards removing the special casing in
TargetInstrInfo::getRegClass. Also add some tests for this.

arsenm · 2025-09-03T15:30:55Z

AMDGPU: Remove the DS special case in getRegClass #156696
AMDGPU: Define agpr versions of ds permute instructions #156695 👈 (View in Graphite)
AMDGPU: Fix definitions of DS ret atomics with AGPRs #156655
AMDGPU: Change DS classes to use RegisterOperand parameters #156580: 1 other dependent PR (#156581)
AMDGPU: Add agpr variants of multi-data DS instructions #156420
AMDGPU: Fix true16 d16 entry table for DS pseudos #156419
main

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-09-03T15:32:12Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Correctly model these without AV_* operands. This is another
step towards removing the special casing in
TargetInstrInfo::getRegClass. Also add some tests for this.

Patch is 21.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156695.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+17-4)
(added) llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll (+334)
(modified) llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s (+14-14)

diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 23dd660c3e57e..bec920380e081 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -520,6 +520,19 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
   let has_gds = 0;
 }
 
+multiclass DS_1A1D_PERMUTE_mc <string opName, SDPatternOperator node = null_frag,
+                                RegisterOperand data_op = VGPROp_32> {
+  assert OperandIsVGPR<data_op>.ret,
+         "DS with 2 data operands should be declared with VGPRs";
+  def "" : DS_1A1D_PERMUTE<opName, node, data_op>;
+
+  let SubtargetPredicate = isGFX90APlus in {
+    def _agpr : DS_1A1D_PERMUTE<opName, null_frag,
+                                getEquivalentAGPROperand<data_op>.ret>;
+  }
+}
+
+
 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
   bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
   (inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
@@ -837,10 +850,10 @@ def DS_NOP : DS_VOID<"ds_nop">;
 let SubtargetPredicate = isGFX8Plus in {
 
 let Uses = [EXEC] in {
-def DS_PERMUTE_B32  : DS_1A1D_PERMUTE <"ds_permute_b32",
-                                       int_amdgcn_ds_permute>;
-def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
-                                       int_amdgcn_ds_bpermute>;
+defm DS_PERMUTE_B32  : DS_1A1D_PERMUTE_mc<"ds_permute_b32",
+                                         int_amdgcn_ds_permute>;
+defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32",
+                                         int_amdgcn_ds_bpermute>;
 }
 
 } // let SubtargetPredicate = isGFX8Plus
diff --git a/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll
new file mode 100644
index 0000000000000..5cd798d4f6db1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll
@@ -0,0 +1,334 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+; Try to stress ds.bpermute and ds.permute instructions with AGPR/AV
+; inputs. It's not permissible to mix AGPRs and VGPR data operands.
+
+define void @ds_bpermute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_a__use_a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=a"()
+  %op1 = call i32 asm "; def $0", "=a"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+  ret void
+}
+
+define void @ds_bpermute_b32_v_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_v_a__use_a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=v"()
+  %op1 = call i32 asm "; def $0", "=a"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+  ret void
+}
+
+define void @ds_bpermute_b32_a_v__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_v__use_a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a0
+; CHECK-NEXT:    ds_bpermute_b32 v0, v1, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=a"()
+  %op1 = call i32 asm "; def $0", "=v"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+  ret void
+}
+
+define void @ds_bpermute_b32_a_a__use_v(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_a_a__use_v:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=a"()
+  %op1 = call i32 asm "; def $0", "=a"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "v"(i32 %bpermute)
+  ret void
+}
+
+define void @ds_bpermute_b32_v_v__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_v_v__use_a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=v"()
+  %op1 = call i32 asm "; def $0", "=v"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "a"(i32 %bpermute)
+  ret void
+}
+
+define void @ds_bpermute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_av_av__use_av:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=^VA"()
+  %op1 = call i32 asm "; def $0", "=^VA"()
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "^VA"(i32 %bpermute)
+  ret void
+}
+
+define i32 @ds_bpermute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_bpermute_b32_av_av_no_vgprs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    ds_bpermute_b32 v0, v0, v1
+; CHECK-NEXT:    v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
+  %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %vgpr.def = call { <32 x i32>, <32 x i32> }  asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+  %permute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+  ret i32 %permute
+}
+
+define void @ds_permute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_a_a__use_a:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    ds_permute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %op0 = call i32 asm "; def $0", "=a"()
+  %op1 = call i32 asm "; def $0", "=a"()
+  %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "a"(i32 %permute)
+  ret void
+}
+
+define void @ds_permute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_av_av__use_av:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ds_permute_b32 v0, v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+ %op0 = call i32 asm "; def $0", "=^VA"()
+  %op1 = call i32 asm "; def $0", "=^VA"()
+  %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "^VA"(i32 %permute)
+  ret void
+}
+
+define i32 @ds_permute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
+; CHECK-LABEL: ds_permute_b32_av_av_no_vgprs:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v40 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a3, v41 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a4, v42 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a5, v43 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a6, v44 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a7, v45 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a8, v46 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a9, v47 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a10, v56 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a11, v57 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a12, v58 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a13, v59 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a14, v60 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a15, v61 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a16, v62 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_write_b32 a17, v63 ; Reload Reuse
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a0
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def a1
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use v[0:31]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    ds_permute_b32 v0, v0, v1
+; CHECK-NEXT:    v_accvgpr_read_b32 v63, a17 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v62, a16 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v61, a15 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v60, a14 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v59, a13 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v58, a12 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v57, a11 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v56, a10 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v47, a9 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v46, a8 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v45, a7 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v44, a6 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v43, a5 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v42, a4 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v41, a3 ; Reload Reuse
+; CHECK-NEXT:    v_accvgpr_read_b32 v40, a2 ; Reload Reuse
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 10
+  %gep.1 = getelementptr inbounds [512 x i32], ptr addrspace(3) %lds, i32 0, i32 24
+  %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
+  %vgpr.def = call { <32 x i32>, <32 x i32> }  asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
+  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
+  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
+  %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
+  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
+  ret i32 %permute
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
index e8653c4681c1f..d0dc0c76fa0f3 100644
--- a/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
+++ b/llvm/test/MC/AMDGPU/gfx90a_ldst_acc.s
@@ -8282,59 +8282,59 @@ ds_swizzle_b32 a5, v1
 ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00")
 
 // GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v1, a2 offset:65535
 
 // GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a255, v1, a2 offset:65535
 
 // GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v255, a2 offset:65535
 
 // GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v1, a255 offset:65535
 
 // GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v1, a2
 
 // GFX90A: ds_permute_b32 a5, v1, a2       ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v1, a2
 
 // GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_permute_b32 a5, v1, a2 offset:4
 
 // GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_bpermute_b32 a5, v1, a2 offset:65535
 
 // GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_bpermute_b32 a255, v1, a2 offset:65535
 
 // GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_bpermute_b32 a5, v255, a2 offset:65535
 
 // GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05]
-// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
+// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
 ds_bpermute_b32 a5, v1, a255 of...
[truncated]

rampitec · 2025-09-03T19:24:14Z

llvm/test/CodeGen/AMDGPU/ds_permute_a_v.ll

@@ -0,0 +1,334 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5


rampitec: Any chance it will actually use AGPR in codegen?

arsenm: No. This needs new support in AMDGPURewriteAGPRCopyMFMA (which will also need a renaming...)

rampitec

LGTM, but it would be nice to have a codegen test with direct use of AGPR.

These are 2-data operations that need to use all-AGPR or all-VGPR inputs. Stop defining them with AVLdSt data operands, and add _agpr variants.

Correctly model these without AV_* operands. This is another step towards removing the special casing in TargetInstrInfo::getRegClass. Also add some tests for this.

This was referenced Sep 3, 2025

AMDGPU: Fix definitions of DS ret atomics with AGPRs #156655

Merged

AMDGPU: Remove the DS special case in getRegClass #156696

Merged

arsenm added the backend:AMDGPU label Sep 3, 2025 — with Graphite App

arsenm requested review from Sisyph, jayfoad, jrbyrnes, kosarev, mariusz-sikora-at-amd, mbrkusanin, rampitec, shiltian and srpande September 3, 2025 15:31

arsenm marked this pull request as ready for review September 3, 2025 15:32

rampitec reviewed Sep 3, 2025

View reviewed changes

rampitec approved these changes Sep 3, 2025

View reviewed changes

arsenm force-pushed the users/arsenm/amdgpu/add-agpr-ds-permute-insts branch from 75588de to 25ff1b6 Compare September 4, 2025 03:25

arsenm force-pushed the users/arsenm/amdgpu/ds-atomic-1d-ret-agpr-definitions branch from 90b88ef to 61b4eca Compare September 4, 2025 03:25

arsenm added 2 commits September 4, 2025 14:15

AMDGPU: Fix definitions of DS ret atomics with AGPRs

8fccb5d

These are 2-data operations that need to use all-AGPR or all-VGPR inputs. Stop defining them with AVLdSt data operands, and add _agpr variants.

AMDGPU: Define agpr versions of ds permute instructions

22f1911

Correctly model these without AV_* operands. This is another step towards removing the special casing in TargetInstrInfo::getRegClass. Also add some tests for this.

arsenm force-pushed the users/arsenm/amdgpu/ds-atomic-1d-ret-agpr-definitions branch from 61b4eca to 8fccb5d Compare September 4, 2025 05:15

arsenm force-pushed the users/arsenm/amdgpu/add-agpr-ds-permute-insts branch from 25ff1b6 to 22f1911 Compare September 4, 2025 05:15

Base automatically changed from users/arsenm/amdgpu/ds-atomic-1d-ret-agpr-definitions to main September 4, 2025 05:45

arsenm merged commit 76cb5fc into main Sep 4, 2025
14 of 15 checks passed

arsenm deleted the users/arsenm/amdgpu/add-agpr-ds-permute-insts branch September 4, 2025 06:14

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Define agpr versions of ds permute instructions #156695

AMDGPU: Define agpr versions of ds permute instructions #156695

Uh oh!

arsenm commented Sep 3, 2025

Uh oh!

arsenm commented Sep 3, 2025 •

edited

Loading

Uh oh!

llvmbot commented Sep 3, 2025

Uh oh!

rampitec Sep 3, 2025

Uh oh!

arsenm Sep 4, 2025

Uh oh!

rampitec left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

		@@ -0,0 +1,334 @@
		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5

AMDGPU: Define agpr versions of ds permute instructions #156695

AMDGPU: Define agpr versions of ds permute instructions #156695

Uh oh!

Conversation

arsenm commented Sep 3, 2025

Uh oh!

arsenm commented Sep 3, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Sep 3, 2025

Uh oh!

rampitec Sep 3, 2025

Choose a reason for hiding this comment

Uh oh!

arsenm Sep 4, 2025

Choose a reason for hiding this comment

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

arsenm commented Sep 3, 2025 •

edited

Loading